Commit b93c2205 authored by Po-Yen, Chen's avatar Po-Yen, Chen
Browse files

Add macro to control print clock or not

parent 44443434
...@@ -10,6 +10,8 @@ ...@@ -10,6 +10,8 @@
#define CK_TIME_KERNEL 1 #define CK_TIME_KERNEL 1
#define ENABLE_DUMP_CLOCK 1
// constant address space for kernel parameter // constant address space for kernel parameter
// https://llvm.org/docs/AMDGPUUsage.html#address-spaces // https://llvm.org/docs/AMDGPUUsage.html#address-spaces
#define CK_CONSTANT_ADDRESS_SPACE __attribute__((address_space(4))) #define CK_CONSTANT_ADDRESS_SPACE __attribute__((address_space(4)))
......
...@@ -54,13 +54,18 @@ struct GridwiseGemmPipeline_v1<1> ...@@ -54,13 +54,18 @@ struct GridwiseGemmPipeline_v1<1>
const BBlockTransferStep& b_block_copy_step, const BBlockTransferStep& b_block_copy_step,
const BlockwiseGemm& blockwise_gemm, const BlockwiseGemm& blockwise_gemm,
CThreadBuffer& c_thread_buf, CThreadBuffer& c_thread_buf,
index_t num_loop, index_t num_loop
long& loop_start, #if ENABLE_DUMP_CLOCK
long& loop_end) , long& loop_start,
long& loop_end
#endif
)
{ {
#if ENABLE_DUMP_CLOCK
__builtin_amdgcn_sched_barrier(0); __builtin_amdgcn_sched_barrier(0);
asm volatile("; [POYENC] pipeline start" ::); asm volatile("; [POYENC] pipeline start" ::);
__builtin_amdgcn_sched_barrier(0); __builtin_amdgcn_sched_barrier(0);
#endif
// preload data into LDS // preload data into LDS
a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf); a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
...@@ -75,10 +80,12 @@ struct GridwiseGemmPipeline_v1<1> ...@@ -75,10 +80,12 @@ struct GridwiseGemmPipeline_v1<1>
a_blockwise_copy.RunWrite(a_block_desc, a_block_buf); a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
b_blockwise_copy.RunWrite(b_block_desc, b_block_buf); b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
#if ENABLE_DUMP_CLOCK
__builtin_amdgcn_sched_barrier(0); __builtin_amdgcn_sched_barrier(0);
loop_start = __builtin_readcyclecounter(); loop_start = __builtin_readcyclecounter();
asm volatile("; [POYENC] hot-loop start" ::); asm volatile("; [POYENC] hot-loop start" ::);
__builtin_amdgcn_sched_barrier(0); __builtin_amdgcn_sched_barrier(0);
#endif
// main body // main body
if constexpr(HasMainLoop) if constexpr(HasMainLoop)
...@@ -114,6 +121,7 @@ struct GridwiseGemmPipeline_v1<1> ...@@ -114,6 +121,7 @@ struct GridwiseGemmPipeline_v1<1>
blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf); blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
} }
#if ENABLE_DUMP_CLOCK
__builtin_amdgcn_sched_barrier(0); __builtin_amdgcn_sched_barrier(0);
loop_end = __builtin_readcyclecounter(); loop_end = __builtin_readcyclecounter();
asm volatile("; [POYENC] hot-loop end" ::); asm volatile("; [POYENC] hot-loop end" ::);
...@@ -122,6 +130,7 @@ struct GridwiseGemmPipeline_v1<1> ...@@ -122,6 +130,7 @@ struct GridwiseGemmPipeline_v1<1>
__builtin_amdgcn_sched_barrier(0); __builtin_amdgcn_sched_barrier(0);
asm volatile("; [POYENC] pipeline end" ::); asm volatile("; [POYENC] pipeline end" ::);
__builtin_amdgcn_sched_barrier(0); __builtin_amdgcn_sched_barrier(0);
#endif
} }
}; };
...@@ -172,13 +181,18 @@ struct GridwiseGemmPipeline_v1<2> ...@@ -172,13 +181,18 @@ struct GridwiseGemmPipeline_v1<2>
const BBlockTransferStep& b_block_copy_step, const BBlockTransferStep& b_block_copy_step,
const BlockwiseGemm& blockwise_gemm, const BlockwiseGemm& blockwise_gemm,
CThreadBuffer& c_thread_buf, CThreadBuffer& c_thread_buf,
index_t num_loop, index_t num_loop
long& loop_start, #if ENABLE_DUMP_CLOCK
long& loop_end) , long& loop_start,
long& loop_end
#endif
)
{ {
#if ENABLE_DUMP_CLOCK
__builtin_amdgcn_sched_barrier(0); __builtin_amdgcn_sched_barrier(0);
asm volatile("; [POYENC] pipeline start" ::); asm volatile("; [POYENC] pipeline start" ::);
__builtin_amdgcn_sched_barrier(0); __builtin_amdgcn_sched_barrier(0);
#endif
// preload data into LDS // preload data into LDS
{ {
...@@ -198,10 +212,12 @@ struct GridwiseGemmPipeline_v1<2> ...@@ -198,10 +212,12 @@ struct GridwiseGemmPipeline_v1<2>
// Initialize C // Initialize C
c_thread_buf.Clear(); c_thread_buf.Clear();
#if ENABLE_DUMP_CLOCK
__builtin_amdgcn_sched_barrier(0); __builtin_amdgcn_sched_barrier(0);
loop_start = __builtin_readcyclecounter(); loop_start = __builtin_readcyclecounter();
asm volatile("; [POYENC] hot-loop start" ::); asm volatile("; [POYENC] hot-loop start" ::);
__builtin_amdgcn_sched_barrier(0); __builtin_amdgcn_sched_barrier(0);
#endif
// main body // main body
if constexpr(HasMainLoop) if constexpr(HasMainLoop)
...@@ -282,6 +298,7 @@ struct GridwiseGemmPipeline_v1<2> ...@@ -282,6 +298,7 @@ struct GridwiseGemmPipeline_v1<2>
blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf); blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
} }
#if ENABLE_DUMP_CLOCK
__builtin_amdgcn_sched_barrier(0); __builtin_amdgcn_sched_barrier(0);
loop_end = __builtin_readcyclecounter(); loop_end = __builtin_readcyclecounter();
asm volatile("; [POYENC] hot-loop end" ::); asm volatile("; [POYENC] hot-loop end" ::);
...@@ -290,6 +307,7 @@ struct GridwiseGemmPipeline_v1<2> ...@@ -290,6 +307,7 @@ struct GridwiseGemmPipeline_v1<2>
__builtin_amdgcn_sched_barrier(0); __builtin_amdgcn_sched_barrier(0);
asm volatile("; [POYENC] pipeline end" ::); asm volatile("; [POYENC] pipeline end" ::);
__builtin_amdgcn_sched_barrier(0); __builtin_amdgcn_sched_barrier(0);
#endif
} }
}; };
...@@ -335,13 +353,18 @@ struct GridwiseGemmPipelineInterwave_v1<1> ...@@ -335,13 +353,18 @@ struct GridwiseGemmPipelineInterwave_v1<1>
const BBlockTransferStep& b_block_copy_step, const BBlockTransferStep& b_block_copy_step,
const BlockwiseGemm& blockwise_gemm, const BlockwiseGemm& blockwise_gemm,
CThreadBuffer& c_thread_buf, CThreadBuffer& c_thread_buf,
index_t num_loop, index_t num_loop
long& loop_start, #if ENABLE_DUMP_CLOCK
long& loop_end) , long& loop_start,
long& loop_end
#endif
)
{ {
#if ENABLE_DUMP_CLOCK
__builtin_amdgcn_sched_barrier(0); __builtin_amdgcn_sched_barrier(0);
asm volatile("; [POYENC] pipeline start" ::); asm volatile("; [POYENC] pipeline start" ::);
__builtin_amdgcn_sched_barrier(0); __builtin_amdgcn_sched_barrier(0);
#endif
// preload data into LDS // preload data into LDS
a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf); a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
...@@ -356,10 +379,12 @@ struct GridwiseGemmPipelineInterwave_v1<1> ...@@ -356,10 +379,12 @@ struct GridwiseGemmPipelineInterwave_v1<1>
a_blockwise_copy.RunWrite(a_block_desc, a_block_buf); a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
b_blockwise_copy.RunWrite(b_block_desc, b_block_buf); b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
#if ENABLE_DUMP_CLOCK
__builtin_amdgcn_sched_barrier(0); __builtin_amdgcn_sched_barrier(0);
loop_start = __builtin_readcyclecounter(); loop_start = __builtin_readcyclecounter();
asm volatile("; [POYENC] hot-loop start" ::); asm volatile("; [POYENC] hot-loop start" ::);
__builtin_amdgcn_sched_barrier(0); __builtin_amdgcn_sched_barrier(0);
#endif
// main body // main body
if constexpr(HasMainLoop) if constexpr(HasMainLoop)
...@@ -395,6 +420,7 @@ struct GridwiseGemmPipelineInterwave_v1<1> ...@@ -395,6 +420,7 @@ struct GridwiseGemmPipelineInterwave_v1<1>
blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf); blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
} }
#if ENABLE_DUMP_CLOCK
__builtin_amdgcn_sched_barrier(0); __builtin_amdgcn_sched_barrier(0);
loop_end = __builtin_readcyclecounter(); loop_end = __builtin_readcyclecounter();
asm volatile("; [POYENC] hot-loop end" ::); asm volatile("; [POYENC] hot-loop end" ::);
...@@ -403,6 +429,7 @@ struct GridwiseGemmPipelineInterwave_v1<1> ...@@ -403,6 +429,7 @@ struct GridwiseGemmPipelineInterwave_v1<1>
__builtin_amdgcn_sched_barrier(0); __builtin_amdgcn_sched_barrier(0);
asm volatile("; [POYENC] pipeline end" ::); asm volatile("; [POYENC] pipeline end" ::);
__builtin_amdgcn_sched_barrier(0); __builtin_amdgcn_sched_barrier(0);
#endif
} }
}; };
......
...@@ -49,13 +49,18 @@ struct GridwiseGemmPipeline_v2 ...@@ -49,13 +49,18 @@ struct GridwiseGemmPipeline_v2
const BBlockTransferStep& b_block_copy_step, const BBlockTransferStep& b_block_copy_step,
const BlockwiseGemm& blockwise_gemm, const BlockwiseGemm& blockwise_gemm,
CThreadBuffer& c_thread_buf, CThreadBuffer& c_thread_buf,
index_t num_loop, index_t num_loop
long& loop_start, #if ENABLE_DUMP_CLOCK
long& loop_end) , long& loop_start,
long& loop_end
#endif
)
{ {
#if ENABLE_DUMP_CLOCK
__builtin_amdgcn_sched_barrier(0); __builtin_amdgcn_sched_barrier(0);
asm volatile("; [POYENC] pipeline start" ::); asm volatile("; [POYENC] pipeline start" ::);
__builtin_amdgcn_sched_barrier(0); __builtin_amdgcn_sched_barrier(0);
#endif
// global read 0 // global read 0
a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf); a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
...@@ -78,10 +83,12 @@ struct GridwiseGemmPipeline_v2 ...@@ -78,10 +83,12 @@ struct GridwiseGemmPipeline_v2
// global Read 1 // global Read 1
b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf); b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
#if ENABLE_DUMP_CLOCK
__builtin_amdgcn_sched_barrier(0); __builtin_amdgcn_sched_barrier(0);
loop_start = __builtin_readcyclecounter(); loop_start = __builtin_readcyclecounter();
asm volatile("; [POYENC] hot-loop start" ::); asm volatile("; [POYENC] hot-loop start" ::);
__builtin_amdgcn_sched_barrier(0); __builtin_amdgcn_sched_barrier(0);
#endif
// main body // main body
if constexpr(HasMainLoop) if constexpr(HasMainLoop)
...@@ -134,6 +141,7 @@ struct GridwiseGemmPipeline_v2 ...@@ -134,6 +141,7 @@ struct GridwiseGemmPipeline_v2
blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf); blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
} }
#if ENABLE_DUMP_CLOCK
__builtin_amdgcn_sched_barrier(0); __builtin_amdgcn_sched_barrier(0);
loop_end = __builtin_readcyclecounter(); loop_end = __builtin_readcyclecounter();
asm volatile("; [POYENC] hot-loop end" ::); asm volatile("; [POYENC] hot-loop end" ::);
...@@ -142,6 +150,7 @@ struct GridwiseGemmPipeline_v2 ...@@ -142,6 +150,7 @@ struct GridwiseGemmPipeline_v2
__builtin_amdgcn_sched_barrier(0); __builtin_amdgcn_sched_barrier(0);
asm volatile("; [POYENC] pipeline end" ::); asm volatile("; [POYENC] pipeline end" ::);
__builtin_amdgcn_sched_barrier(0); __builtin_amdgcn_sched_barrier(0);
#endif
} }
}; };
......
...@@ -645,10 +645,12 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1 ...@@ -645,10 +645,12 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
void* __restrict__ p_shared, void* __restrict__ p_shared,
const Argument& karg) const Argument& karg)
{ {
#if ENABLE_DUMP_CLOCK
__builtin_amdgcn_sched_barrier(0); __builtin_amdgcn_sched_barrier(0);
const long kernel_start = __builtin_readcyclecounter(); const long kernel_start = __builtin_readcyclecounter();
asm volatile("; [POYENC] kernel start" ::); asm volatile("; [POYENC] kernel start" ::);
__builtin_amdgcn_sched_barrier(0); __builtin_amdgcn_sched_barrier(0);
#endif
const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1( const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(
karg.M, karg.MPadded, karg.K, karg.KPadded, karg.StrideA, karg.AK0); karg.M, karg.MPadded, karg.K, karg.KPadded, karg.StrideA, karg.AK0);
...@@ -812,7 +814,9 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1 ...@@ -812,7 +814,9 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
(a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) / (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) /
KPerBlock); KPerBlock);
#if ENABLE_DUMP_CLOCK
long loop_start = 0, loop_end = 0; long loop_start = 0, loop_end = 0;
#endif
gridwise_gemm_pipeline.template Run<HasMainKBlockLoop>(a_grid_desc_ak0_m_ak1, gridwise_gemm_pipeline.template Run<HasMainKBlockLoop>(a_grid_desc_ak0_m_ak1,
a_block_desc_ak0_m_ak1, a_block_desc_ak0_m_ak1,
a_blockwise_copy, a_blockwise_copy,
...@@ -827,9 +831,11 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1 ...@@ -827,9 +831,11 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
b_block_slice_copy_step, b_block_slice_copy_step,
blockwise_gemm, blockwise_gemm,
c_thread_buf, c_thread_buf,
num_k_block_main_loop, num_k_block_main_loop
loop_start, #if ENABLE_DUMP_CLOCK
loop_end); , loop_start, loop_end
#endif
);
// shuffle C and write out // shuffle C and write out
{ {
...@@ -1027,6 +1033,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1 ...@@ -1027,6 +1033,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
} }
}); });
#if ENABLE_DUMP_CLOCK
__builtin_amdgcn_sched_barrier(0); __builtin_amdgcn_sched_barrier(0);
const long kernel_end = __builtin_readcyclecounter(); const long kernel_end = __builtin_readcyclecounter();
asm volatile("; [POYENC] kernel end" ::); asm volatile("; [POYENC] kernel end" ::);
...@@ -1039,6 +1046,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1 ...@@ -1039,6 +1046,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
loop_end - loop_start, loop_end - loop_start,
kernel_end - loop_end); kernel_end - loop_end);
} }
#endif
} }
} }
}; };
......
...@@ -500,10 +500,12 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3 ...@@ -500,10 +500,12 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
void* __restrict__ p_shared, void* __restrict__ p_shared,
const Argument& karg) const Argument& karg)
{ {
#if ENABLE_DUMP_CLOCK
__builtin_amdgcn_sched_barrier(0); __builtin_amdgcn_sched_barrier(0);
const long kernel_start = __builtin_readcyclecounter(); const long kernel_start = __builtin_readcyclecounter();
asm volatile("; [POYENC] kernel start" ::); asm volatile("; [POYENC] kernel start" ::);
__builtin_amdgcn_sched_barrier(0); __builtin_amdgcn_sched_barrier(0);
#endif
const auto a_grid_desc_k0_m_k1 = const auto a_grid_desc_k0_m_k1 =
MakeAGridDescriptor_K0_M_K1(karg.M, karg.MPadded, karg.K, karg.K0, karg.StrideA); MakeAGridDescriptor_K0_M_K1(karg.M, karg.MPadded, karg.K, karg.K0, karg.StrideA);
...@@ -657,7 +659,9 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3 ...@@ -657,7 +659,9 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
// gridwise GEMM pipeline // gridwise GEMM pipeline
const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(karg.NumKBlockLoop); const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(karg.NumKBlockLoop);
#if ENABLE_DUMP_CLOCK
long loop_start = 0, loop_end = 0; long loop_start = 0, loop_end = 0;
#endif
GridwiseGemmPipe::template Run<HasMainKBlockLoop>(a_grid_desc_k0_m_k1, GridwiseGemmPipe::template Run<HasMainKBlockLoop>(a_grid_desc_k0_m_k1,
a_block_desc_k0_m_k1, a_block_desc_k0_m_k1,
a_blockwise_copy, a_blockwise_copy,
...@@ -672,9 +676,11 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3 ...@@ -672,9 +676,11 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
b_block_slice_copy_step, b_block_slice_copy_step,
blockwise_gemm, blockwise_gemm,
c_thread_buf, c_thread_buf,
num_k_block_main_loop, num_k_block_main_loop
loop_start, #if ENABLE_DUMP_CLOCK
loop_end); , loop_start, loop_end
#endif
);
// output: register to global memory // output: register to global memory
{ {
...@@ -753,6 +759,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3 ...@@ -753,6 +759,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2, c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2,
c_grid_buf); c_grid_buf);
#if ENABLE_DUMP_CLOCK
__builtin_amdgcn_sched_barrier(0); __builtin_amdgcn_sched_barrier(0);
const long kernel_end = __builtin_readcyclecounter(); const long kernel_end = __builtin_readcyclecounter();
asm volatile("; [POYENC] kernel end" ::); asm volatile("; [POYENC] kernel end" ::);
...@@ -765,6 +772,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3 ...@@ -765,6 +772,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
loop_end - loop_start, loop_end - loop_start,
kernel_end - loop_end); kernel_end - loop_end);
} }
#endif
} }
} }
}; };
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment