Add macro to control print clock or not

b93c2205 · Po-Yen, Chen · 44443434 · b93c2205 · b93c2205 · b93c2205
Commit b93c2205 authored May 10, 2023 by Po-Yen, Chen
5 changed files
--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -10,6 +10,8 @@

 #define CK_TIME_KERNEL 1

+#define ENABLE_DUMP_CLOCK 1
+
 // constant address space for kernel parameter
 // https://llvm.org/docs/AMDGPUUsage.html#address-spaces
 #define CK_CONSTANT_ADDRESS_SPACE __attribute__((address_space(4)))

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp
@@ -54,13 +54,18 @@ struct GridwiseGemmPipeline_v1<1>
                               const BBlockTransferStep& b_block_copy_step,
                               const BlockwiseGemm& blockwise_gemm,
                               CThreadBuffer& c_thread_buf,
-                               index_t num_loop,
-                               long& loop_start,
-                               long& loop_end)
+                               index_t num_loop
+#if ENABLE_DUMP_CLOCK
+                               , long& loop_start,
+                               long& loop_end
+#endif
+                               )
    {
+#if ENABLE_DUMP_CLOCK
        __builtin_amdgcn_sched_barrier(0);
        asm volatile("; [POYENC] pipeline start" ::);
        __builtin_amdgcn_sched_barrier(0);
+#endif

        // preload data into LDS
        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
@@ -75,10 +80,12 @@ struct GridwiseGemmPipeline_v1<1>
        a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
        b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);

+#if ENABLE_DUMP_CLOCK
        __builtin_amdgcn_sched_barrier(0);
        loop_start = __builtin_readcyclecounter();
        asm volatile("; [POYENC] hot-loop start" ::);
        __builtin_amdgcn_sched_barrier(0);
+#endif

        // main body
        if constexpr(HasMainLoop)
@@ -114,6 +121,7 @@ struct GridwiseGemmPipeline_v1<1>
            blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
        }

+#if ENABLE_DUMP_CLOCK
        __builtin_amdgcn_sched_barrier(0);
        loop_end = __builtin_readcyclecounter();
        asm volatile("; [POYENC] hot-loop end" ::);
@@ -122,6 +130,7 @@ struct GridwiseGemmPipeline_v1<1>
        __builtin_amdgcn_sched_barrier(0);
        asm volatile("; [POYENC] pipeline end" ::);
        __builtin_amdgcn_sched_barrier(0);
+#endif
    }
 };

@@ -172,13 +181,18 @@ struct GridwiseGemmPipeline_v1<2>
                               const BBlockTransferStep& b_block_copy_step,
                               const BlockwiseGemm& blockwise_gemm,
                               CThreadBuffer& c_thread_buf,
-                               index_t num_loop,
-                               long& loop_start,
-                               long& loop_end)
+                               index_t num_loop
+#if ENABLE_DUMP_CLOCK
+                               , long& loop_start,
+                               long& loop_end
+#endif
+                               )
    {
+#if ENABLE_DUMP_CLOCK
        __builtin_amdgcn_sched_barrier(0);
        asm volatile("; [POYENC] pipeline start" ::);
        __builtin_amdgcn_sched_barrier(0);
+#endif

        // preload data into LDS
        {
@@ -198,10 +212,12 @@ struct GridwiseGemmPipeline_v1<2>
        // Initialize C
        c_thread_buf.Clear();

+#if ENABLE_DUMP_CLOCK
        __builtin_amdgcn_sched_barrier(0);
        loop_start = __builtin_readcyclecounter();
        asm volatile("; [POYENC] hot-loop start" ::);
        __builtin_amdgcn_sched_barrier(0);
+#endif

        // main body
        if constexpr(HasMainLoop)
@@ -282,6 +298,7 @@ struct GridwiseGemmPipeline_v1<2>
            blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
        }

+#if ENABLE_DUMP_CLOCK
        __builtin_amdgcn_sched_barrier(0);
        loop_end = __builtin_readcyclecounter();
        asm volatile("; [POYENC] hot-loop end" ::);
@@ -290,6 +307,7 @@ struct GridwiseGemmPipeline_v1<2>
        __builtin_amdgcn_sched_barrier(0);
        asm volatile("; [POYENC] pipeline end" ::);
        __builtin_amdgcn_sched_barrier(0);
+#endif
    }
 };

@@ -335,13 +353,18 @@ struct GridwiseGemmPipelineInterwave_v1<1>
                               const BBlockTransferStep& b_block_copy_step,
                               const BlockwiseGemm& blockwise_gemm,
                               CThreadBuffer& c_thread_buf,
-                               index_t num_loop,
-                               long& loop_start,
-                               long& loop_end)
+                               index_t num_loop
+#if ENABLE_DUMP_CLOCK
+                               , long& loop_start,
+                               long& loop_end
+#endif
+                               )
    {
+#if ENABLE_DUMP_CLOCK
        __builtin_amdgcn_sched_barrier(0);
        asm volatile("; [POYENC] pipeline start" ::);
        __builtin_amdgcn_sched_barrier(0);
+#endif

        // preload data into LDS
        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
@@ -356,10 +379,12 @@ struct GridwiseGemmPipelineInterwave_v1<1>
        a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
        b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);

+#if ENABLE_DUMP_CLOCK
        __builtin_amdgcn_sched_barrier(0);
        loop_start = __builtin_readcyclecounter();
        asm volatile("; [POYENC] hot-loop start" ::);
        __builtin_amdgcn_sched_barrier(0);
+#endif

        // main body
        if constexpr(HasMainLoop)
@@ -395,6 +420,7 @@ struct GridwiseGemmPipelineInterwave_v1<1>
            blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
        }

+#if ENABLE_DUMP_CLOCK
        __builtin_amdgcn_sched_barrier(0);
        loop_end = __builtin_readcyclecounter();
        asm volatile("; [POYENC] hot-loop end" ::);
@@ -403,6 +429,7 @@ struct GridwiseGemmPipelineInterwave_v1<1>
        __builtin_amdgcn_sched_barrier(0);
        asm volatile("; [POYENC] pipeline end" ::);
        __builtin_amdgcn_sched_barrier(0);
+#endif
    }
 };


--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp
@@ -49,13 +49,18 @@ struct GridwiseGemmPipeline_v2
                               const BBlockTransferStep& b_block_copy_step,
                               const BlockwiseGemm& blockwise_gemm,
                               CThreadBuffer& c_thread_buf,
-                               index_t num_loop,
-                               long& loop_start,
-                               long& loop_end)
+                               index_t num_loop
+#if ENABLE_DUMP_CLOCK
+                               , long& loop_start,
+                               long& loop_end
+#endif
+                               )
    {
+#if ENABLE_DUMP_CLOCK
        __builtin_amdgcn_sched_barrier(0);
        asm volatile("; [POYENC] pipeline start" ::);
        __builtin_amdgcn_sched_barrier(0);
+#endif

        // global read 0
        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
@@ -78,10 +83,12 @@ struct GridwiseGemmPipeline_v2
        // global Read 1
        b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);

+#if ENABLE_DUMP_CLOCK
        __builtin_amdgcn_sched_barrier(0);
        loop_start = __builtin_readcyclecounter();
        asm volatile("; [POYENC] hot-loop start" ::);
        __builtin_amdgcn_sched_barrier(0);
+#endif

        // main body
        if constexpr(HasMainLoop)
@@ -134,6 +141,7 @@ struct GridwiseGemmPipeline_v2
            blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
        }

+#if ENABLE_DUMP_CLOCK
        __builtin_amdgcn_sched_barrier(0);
        loop_end = __builtin_readcyclecounter();
        asm volatile("; [POYENC] hot-loop end" ::);
@@ -142,6 +150,7 @@ struct GridwiseGemmPipeline_v2
        __builtin_amdgcn_sched_barrier(0);
        asm volatile("; [POYENC] pipeline end" ::);
        __builtin_amdgcn_sched_barrier(0);
+#endif
    }
 };


--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp
@@ -645,10 +645,12 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
                               void* __restrict__ p_shared,
                               const Argument& karg)
    {
+#if ENABLE_DUMP_CLOCK
        __builtin_amdgcn_sched_barrier(0);
        const long kernel_start = __builtin_readcyclecounter();
        asm volatile("; [POYENC] kernel start" ::);
        __builtin_amdgcn_sched_barrier(0);
+#endif

        const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(
            karg.M, karg.MPadded, karg.K, karg.KPadded, karg.StrideA, karg.AK0);
@@ -812,7 +814,9 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
            (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) /
            KPerBlock);

+#if ENABLE_DUMP_CLOCK
        long loop_start = 0, loop_end = 0;
+#endif
        gridwise_gemm_pipeline.template Run<HasMainKBlockLoop>(a_grid_desc_ak0_m_ak1,
                                                               a_block_desc_ak0_m_ak1,
                                                               a_blockwise_copy,
@@ -827,9 +831,11 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
                                                               b_block_slice_copy_step,
                                                               blockwise_gemm,
                                                               c_thread_buf,
-                                                               num_k_block_main_loop,
-                                                               loop_start,
-                                                               loop_end);
+                                                               num_k_block_main_loop
+#if ENABLE_DUMP_CLOCK
+                                                               , loop_start, loop_end
+#endif
+                                                               );

        // shuffle C and write out
        {
@@ -1027,6 +1033,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
                }
            });

+#if ENABLE_DUMP_CLOCK
            __builtin_amdgcn_sched_barrier(0);
            const long kernel_end = __builtin_readcyclecounter();
            asm volatile("; [POYENC] kernel end" ::);
@@ -1039,6 +1046,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
                       loop_end - loop_start,
                       kernel_end - loop_end);
            }
+#endif
        }
    }
 };

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
@@ -500,10 +500,12 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
                               void* __restrict__ p_shared,
                               const Argument& karg)
    {
+#if ENABLE_DUMP_CLOCK
        __builtin_amdgcn_sched_barrier(0);
        const long kernel_start = __builtin_readcyclecounter();
        asm volatile("; [POYENC] kernel start" ::);
        __builtin_amdgcn_sched_barrier(0);
+#endif

        const auto a_grid_desc_k0_m_k1 =
            MakeAGridDescriptor_K0_M_K1(karg.M, karg.MPadded, karg.K, karg.K0, karg.StrideA);
@@ -657,7 +659,9 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
        // gridwise GEMM pipeline
        const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(karg.NumKBlockLoop);

+#if ENABLE_DUMP_CLOCK
        long loop_start = 0, loop_end = 0;
+#endif
        GridwiseGemmPipe::template Run<HasMainKBlockLoop>(a_grid_desc_k0_m_k1,
                                                          a_block_desc_k0_m_k1,
                                                          a_blockwise_copy,
@@ -672,9 +676,11 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
                                                          b_block_slice_copy_step,
                                                          blockwise_gemm,
                                                          c_thread_buf,
-                                                          num_k_block_main_loop,
-                                                          loop_start,
-                                                          loop_end);
+                                                          num_k_block_main_loop
+#if ENABLE_DUMP_CLOCK
+                                                          , loop_start, loop_end
+#endif
+                                                          );

        // output: register to global memory
        {
@@ -753,6 +759,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
                              c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2,
                              c_grid_buf);

+#if ENABLE_DUMP_CLOCK
            __builtin_amdgcn_sched_barrier(0);
            const long kernel_end = __builtin_readcyclecounter();
            asm volatile("; [POYENC] kernel end" ::);
@@ -765,6 +772,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
                       loop_end - loop_start,
                       kernel_end - loop_end);
            }
+#endif
        }
    }
 };