Commit e10d1043 authored by Po-Yen, Chen
Browse files

Add a macro to control whether the clock counters are printed

parent 81ea7c0a
......@@ -10,6 +10,8 @@
#define CK_TIME_KERNEL 1
// Compile-time switch for the cycle-counter instrumentation. When non-zero, the
// gridwise GEMM pipelines take two extra `long&` arguments (loop_start, loop_end)
// and store __builtin_readcyclecounter() into them around the hot loop, and the
// gridwise GEMM kernels record kernel_start / kernel_end the same way. The
// "; [POYENC] ..." asm markers and their surrounding
// __builtin_amdgcn_sched_barrier(0) calls are guarded by the same switch.
// Set to 0 to compile all of that out.
// NOTE(review): presumably this also gates the code that prints the measured
// clock deltas -- that code is not fully visible here; confirm.
#define ENABLE_DUMP_CLOCK 1
// constant address space for kernel parameter
// https://llvm.org/docs/AMDGPUUsage.html#address-spaces
#define CK_CONSTANT_ADDRESS_SPACE __attribute__((address_space(4)))
......
......@@ -54,13 +54,19 @@ struct GridwiseGemmPipeline_v1<1>
const BBlockTransferStep& b_block_copy_step,
const BlockwiseGemm& blockwise_gemm,
CThreadBuffer& c_thread_buf,
index_t num_loop,
index_t num_loop
#if ENABLE_DUMP_CLOCK
,
long& loop_start,
long& loop_end)
long& loop_end
#endif
)
{
#if ENABLE_DUMP_CLOCK
__builtin_amdgcn_sched_barrier(0);
asm volatile("; [POYENC] pipeline start" ::);
__builtin_amdgcn_sched_barrier(0);
#endif
// preload data into LDS
a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
......@@ -75,10 +81,12 @@ struct GridwiseGemmPipeline_v1<1>
a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
#if ENABLE_DUMP_CLOCK
__builtin_amdgcn_sched_barrier(0);
loop_start = __builtin_readcyclecounter();
asm volatile("; [POYENC] hot-loop start" ::);
__builtin_amdgcn_sched_barrier(0);
#endif
// main body
if constexpr(HasMainLoop)
......@@ -114,6 +122,7 @@ struct GridwiseGemmPipeline_v1<1>
blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
}
#if ENABLE_DUMP_CLOCK
__builtin_amdgcn_sched_barrier(0);
loop_end = __builtin_readcyclecounter();
asm volatile("; [POYENC] hot-loop end" ::);
......@@ -122,6 +131,7 @@ struct GridwiseGemmPipeline_v1<1>
__builtin_amdgcn_sched_barrier(0);
asm volatile("; [POYENC] pipeline end" ::);
__builtin_amdgcn_sched_barrier(0);
#endif
}
};
......@@ -172,13 +182,19 @@ struct GridwiseGemmPipeline_v1<2>
const BBlockTransferStep& b_block_copy_step,
const BlockwiseGemm& blockwise_gemm,
CThreadBuffer& c_thread_buf,
index_t num_loop,
index_t num_loop
#if ENABLE_DUMP_CLOCK
,
long& loop_start,
long& loop_end)
long& loop_end
#endif
)
{
#if ENABLE_DUMP_CLOCK
__builtin_amdgcn_sched_barrier(0);
asm volatile("; [POYENC] pipeline start" ::);
__builtin_amdgcn_sched_barrier(0);
#endif
// preload data into LDS
{
......@@ -198,10 +214,12 @@ struct GridwiseGemmPipeline_v1<2>
// Initialize C
c_thread_buf.Clear();
#if ENABLE_DUMP_CLOCK
__builtin_amdgcn_sched_barrier(0);
loop_start = __builtin_readcyclecounter();
asm volatile("; [POYENC] hot-loop start" ::);
__builtin_amdgcn_sched_barrier(0);
#endif
// main body
if constexpr(HasMainLoop)
......@@ -282,6 +300,7 @@ struct GridwiseGemmPipeline_v1<2>
blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
}
#if ENABLE_DUMP_CLOCK
__builtin_amdgcn_sched_barrier(0);
loop_end = __builtin_readcyclecounter();
asm volatile("; [POYENC] hot-loop end" ::);
......@@ -290,6 +309,7 @@ struct GridwiseGemmPipeline_v1<2>
__builtin_amdgcn_sched_barrier(0);
asm volatile("; [POYENC] pipeline end" ::);
__builtin_amdgcn_sched_barrier(0);
#endif
}
};
......@@ -335,13 +355,19 @@ struct GridwiseGemmPipelineInterwave_v1<1>
const BBlockTransferStep& b_block_copy_step,
const BlockwiseGemm& blockwise_gemm,
CThreadBuffer& c_thread_buf,
index_t num_loop,
index_t num_loop
#if ENABLE_DUMP_CLOCK
,
long& loop_start,
long& loop_end)
long& loop_end
#endif
)
{
#if ENABLE_DUMP_CLOCK
__builtin_amdgcn_sched_barrier(0);
asm volatile("; [POYENC] pipeline start" ::);
__builtin_amdgcn_sched_barrier(0);
#endif
// preload data into LDS
a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
......@@ -356,10 +382,12 @@ struct GridwiseGemmPipelineInterwave_v1<1>
a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
#if ENABLE_DUMP_CLOCK
__builtin_amdgcn_sched_barrier(0);
loop_start = __builtin_readcyclecounter();
asm volatile("; [POYENC] hot-loop start" ::);
__builtin_amdgcn_sched_barrier(0);
#endif
// main body
if constexpr(HasMainLoop)
......@@ -395,6 +423,7 @@ struct GridwiseGemmPipelineInterwave_v1<1>
blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
}
#if ENABLE_DUMP_CLOCK
__builtin_amdgcn_sched_barrier(0);
loop_end = __builtin_readcyclecounter();
asm volatile("; [POYENC] hot-loop end" ::);
......@@ -403,6 +432,7 @@ struct GridwiseGemmPipelineInterwave_v1<1>
__builtin_amdgcn_sched_barrier(0);
asm volatile("; [POYENC] pipeline end" ::);
__builtin_amdgcn_sched_barrier(0);
#endif
}
};
......
......@@ -49,13 +49,19 @@ struct GridwiseGemmPipeline_v2
const BBlockTransferStep& b_block_copy_step,
const BlockwiseGemm& blockwise_gemm,
CThreadBuffer& c_thread_buf,
index_t num_loop,
index_t num_loop
#if ENABLE_DUMP_CLOCK
,
long& loop_start,
long& loop_end)
long& loop_end
#endif
)
{
#if ENABLE_DUMP_CLOCK
__builtin_amdgcn_sched_barrier(0);
asm volatile("; [POYENC] pipeline start" ::);
__builtin_amdgcn_sched_barrier(0);
#endif
// global read 0
a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
......@@ -78,10 +84,12 @@ struct GridwiseGemmPipeline_v2
// global Read 1
b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
#if ENABLE_DUMP_CLOCK
__builtin_amdgcn_sched_barrier(0);
loop_start = __builtin_readcyclecounter();
asm volatile("; [POYENC] hot-loop start" ::);
__builtin_amdgcn_sched_barrier(0);
#endif
// main body
if constexpr(HasMainLoop)
......@@ -134,6 +142,7 @@ struct GridwiseGemmPipeline_v2
blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
}
#if ENABLE_DUMP_CLOCK
__builtin_amdgcn_sched_barrier(0);
loop_end = __builtin_readcyclecounter();
asm volatile("; [POYENC] hot-loop end" ::);
......@@ -142,6 +151,7 @@ struct GridwiseGemmPipeline_v2
__builtin_amdgcn_sched_barrier(0);
asm volatile("; [POYENC] pipeline end" ::);
__builtin_amdgcn_sched_barrier(0);
#endif
}
};
......
......@@ -292,10 +292,12 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
c_grid_desc_mblock_mperblock_nblock_nperblock,
const Block2CTileMap& block_2_ctile_map)
{
#if ENABLE_DUMP_CLOCK
__builtin_amdgcn_sched_barrier(0);
const long kernel_start = __builtin_readcyclecounter();
asm volatile("; [POYENC] kernel start" ::);
__builtin_amdgcn_sched_barrier(0);
#endif
const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize());
......@@ -441,7 +443,9 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
(a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) /
KPerBlock);
#if ENABLE_DUMP_CLOCK
long loop_start = 0, loop_end = 0;
#endif
gridwise_gemm_pipeline.template Run<HasMainKBlockLoop>(a_grid_desc_ak0_m_ak1,
a_block_desc_ak0_m_ak1,
a_blockwise_copy,
......@@ -456,9 +460,13 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
b_block_slice_copy_step,
blockwise_gemm,
c_thread_buf,
num_k_block_main_loop,
num_k_block_main_loop
#if ENABLE_DUMP_CLOCK
,
loop_start,
loop_end);
loop_end
#endif
);
// shuffle C and write out
{
......@@ -656,6 +664,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
}
});
#if ENABLE_DUMP_CLOCK
__builtin_amdgcn_sched_barrier(0);
const long kernel_end = __builtin_readcyclecounter();
asm volatile("; [POYENC] kernel end" ::);
......@@ -668,6 +677,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
loop_end - loop_start,
kernel_end - loop_end);
}
#endif
}
}
};
......
......@@ -331,10 +331,12 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
const CElementwiseOperation& c_element_op,
const Block2CTileMap& block_2_ctile_map)
{
#if ENABLE_DUMP_CLOCK
__builtin_amdgcn_sched_barrier(0);
const long kernel_start = __builtin_readcyclecounter();
asm volatile("; [POYENC] kernel start" ::);
__builtin_amdgcn_sched_barrier(0);
#endif
const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_a_grid, a_grid_desc_k0_m_k1.GetElementSpaceSize());
......@@ -474,7 +476,9 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
// gridwise GEMM pipeline
const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(K0 / K0PerBlock);
#if ENABLE_DUMP_CLOCK
long loop_start = 0, loop_end = 0;
#endif
GridwiseGemmPipe::template Run<HasMainKBlockLoop>(a_grid_desc_k0_m_k1,
a_block_desc_k0_m_k1,
a_blockwise_copy,
......@@ -489,9 +493,13 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
b_block_slice_copy_step,
blockwise_gemm,
c_thread_buf,
num_k_block_main_loop,
num_k_block_main_loop
#if ENABLE_DUMP_CLOCK
,
loop_start,
loop_end);
loop_end
#endif
);
// output: register to global memory
{
......@@ -570,6 +578,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2,
c_grid_buf);
#if ENABLE_DUMP_CLOCK
__builtin_amdgcn_sched_barrier(0);
const long kernel_end = __builtin_readcyclecounter();
asm volatile("; [POYENC] kernel end" ::);
......@@ -582,6 +591,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
loop_end - loop_start,
kernel_end - loop_end);
}
#endif
}
}
};
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment