Add macro to control whether clock cycles are printed

e10d1043 · Po-Yen, Chen · 81ea7c0a · e10d1043 · e10d1043 · e10d1043
Commit e10d1043 authored May 10, 2023 by Po-Yen, Chen
5 changed files
--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -10,6 +10,8 @@

 #define CK_TIME_KERNEL 1

+#define ENABLE_DUMP_CLOCK 1 // non-zero compiles in the `#if ENABLE_DUMP_CLOCK` sections: cycle-counter reads, asm markers and the extra loop_start/loop_end arguments of the GEMM pipeline Run() calls
+
 // constant address space for kernel parameter
 // https://llvm.org/docs/AMDGPUUsage.html#address-spaces
 #define CK_CONSTANT_ADDRESS_SPACE __attribute__((address_space(4)))

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp
@@ -54,13 +54,19 @@ struct GridwiseGemmPipeline_v1<1>
                               const BBlockTransferStep& b_block_copy_step,
                               const BlockwiseGemm& blockwise_gemm,
                               CThreadBuffer& c_thread_buf,
-                               index_t num_loop,
+                               index_t num_loop
+#if ENABLE_DUMP_CLOCK
+                               ,
                               long& loop_start,
-                               long& loop_end)
+                               long& loop_end
+#endif
+    )
    {
+#if ENABLE_DUMP_CLOCK
        __builtin_amdgcn_sched_barrier(0);
        asm volatile("; [POYENC] pipeline start" ::);
        __builtin_amdgcn_sched_barrier(0);
+#endif

        // preload data into LDS
        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
@@ -75,10 +81,12 @@ struct GridwiseGemmPipeline_v1<1>
        a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
        b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);

+#if ENABLE_DUMP_CLOCK
        __builtin_amdgcn_sched_barrier(0);
        loop_start = __builtin_readcyclecounter();
        asm volatile("; [POYENC] hot-loop start" ::);
        __builtin_amdgcn_sched_barrier(0);
+#endif

        // main body
        if constexpr(HasMainLoop)
@@ -114,6 +122,7 @@ struct GridwiseGemmPipeline_v1<1>
            blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
        }

+#if ENABLE_DUMP_CLOCK
        __builtin_amdgcn_sched_barrier(0);
        loop_end = __builtin_readcyclecounter();
        asm volatile("; [POYENC] hot-loop end" ::);
@@ -122,6 +131,7 @@ struct GridwiseGemmPipeline_v1<1>
        __builtin_amdgcn_sched_barrier(0);
        asm volatile("; [POYENC] pipeline end" ::);
        __builtin_amdgcn_sched_barrier(0);
+#endif
    }
 };

@@ -172,13 +182,19 @@ struct GridwiseGemmPipeline_v1<2>
                               const BBlockTransferStep& b_block_copy_step,
                               const BlockwiseGemm& blockwise_gemm,
                               CThreadBuffer& c_thread_buf,
-                               index_t num_loop,
+                               index_t num_loop
+#if ENABLE_DUMP_CLOCK
+                               ,
                               long& loop_start,
-                               long& loop_end)
+                               long& loop_end
+#endif
+    )
    {
+#if ENABLE_DUMP_CLOCK
        __builtin_amdgcn_sched_barrier(0);
        asm volatile("; [POYENC] pipeline start" ::);
        __builtin_amdgcn_sched_barrier(0);
+#endif

        // preload data into LDS
        {
@@ -198,10 +214,12 @@ struct GridwiseGemmPipeline_v1<2>
        // Initialize C
        c_thread_buf.Clear();

+#if ENABLE_DUMP_CLOCK
        __builtin_amdgcn_sched_barrier(0);
        loop_start = __builtin_readcyclecounter();
        asm volatile("; [POYENC] hot-loop start" ::);
        __builtin_amdgcn_sched_barrier(0);
+#endif

        // main body
        if constexpr(HasMainLoop)
@@ -282,6 +300,7 @@ struct GridwiseGemmPipeline_v1<2>
            blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
        }

+#if ENABLE_DUMP_CLOCK
        __builtin_amdgcn_sched_barrier(0);
        loop_end = __builtin_readcyclecounter();
        asm volatile("; [POYENC] hot-loop end" ::);
@@ -290,6 +309,7 @@ struct GridwiseGemmPipeline_v1<2>
        __builtin_amdgcn_sched_barrier(0);
        asm volatile("; [POYENC] pipeline end" ::);
        __builtin_amdgcn_sched_barrier(0);
+#endif
    }
 };

@@ -335,13 +355,19 @@ struct GridwiseGemmPipelineInterwave_v1<1>
                               const BBlockTransferStep& b_block_copy_step,
                               const BlockwiseGemm& blockwise_gemm,
                               CThreadBuffer& c_thread_buf,
-                               index_t num_loop,
+                               index_t num_loop
+#if ENABLE_DUMP_CLOCK
+                               ,
                               long& loop_start,
-                               long& loop_end)
+                               long& loop_end
+#endif
+    )
    {
+#if ENABLE_DUMP_CLOCK
        __builtin_amdgcn_sched_barrier(0);
        asm volatile("; [POYENC] pipeline start" ::);
        __builtin_amdgcn_sched_barrier(0);
+#endif

        // preload data into LDS
        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
@@ -356,10 +382,12 @@ struct GridwiseGemmPipelineInterwave_v1<1>
        a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
        b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);

+#if ENABLE_DUMP_CLOCK
        __builtin_amdgcn_sched_barrier(0);
        loop_start = __builtin_readcyclecounter();
        asm volatile("; [POYENC] hot-loop start" ::);
        __builtin_amdgcn_sched_barrier(0);
+#endif

        // main body
        if constexpr(HasMainLoop)
@@ -395,6 +423,7 @@ struct GridwiseGemmPipelineInterwave_v1<1>
            blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
        }

+#if ENABLE_DUMP_CLOCK
        __builtin_amdgcn_sched_barrier(0);
        loop_end = __builtin_readcyclecounter();
        asm volatile("; [POYENC] hot-loop end" ::);
@@ -403,6 +432,7 @@ struct GridwiseGemmPipelineInterwave_v1<1>
        __builtin_amdgcn_sched_barrier(0);
        asm volatile("; [POYENC] pipeline end" ::);
        __builtin_amdgcn_sched_barrier(0);
+#endif
    }
 };


--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp
@@ -49,13 +49,19 @@ struct GridwiseGemmPipeline_v2
                               const BBlockTransferStep& b_block_copy_step,
                               const BlockwiseGemm& blockwise_gemm,
                               CThreadBuffer& c_thread_buf,
-                               index_t num_loop,
+                               index_t num_loop
+#if ENABLE_DUMP_CLOCK
+                               ,
                               long& loop_start,
-                               long& loop_end)
+                               long& loop_end
+#endif
+    )
    {
+#if ENABLE_DUMP_CLOCK
        __builtin_amdgcn_sched_barrier(0);
        asm volatile("; [POYENC] pipeline start" ::);
        __builtin_amdgcn_sched_barrier(0);
+#endif

        // global read 0
        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
@@ -78,10 +84,12 @@ struct GridwiseGemmPipeline_v2
        // global Read 1
        b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);

+#if ENABLE_DUMP_CLOCK
        __builtin_amdgcn_sched_barrier(0);
        loop_start = __builtin_readcyclecounter();
        asm volatile("; [POYENC] hot-loop start" ::);
        __builtin_amdgcn_sched_barrier(0);
+#endif

        // main body
        if constexpr(HasMainLoop)
@@ -134,6 +142,7 @@ struct GridwiseGemmPipeline_v2
            blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
        }

+#if ENABLE_DUMP_CLOCK
        __builtin_amdgcn_sched_barrier(0);
        loop_end = __builtin_readcyclecounter();
        asm volatile("; [POYENC] hot-loop end" ::);
@@ -142,6 +151,7 @@ struct GridwiseGemmPipeline_v2
        __builtin_amdgcn_sched_barrier(0);
        asm volatile("; [POYENC] pipeline end" ::);
        __builtin_amdgcn_sched_barrier(0);
+#endif
    }
 };


--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp
@@ -292,10 +292,12 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
                                   c_grid_desc_mblock_mperblock_nblock_nperblock,
                               const Block2CTileMap& block_2_ctile_map)
    {
+#if ENABLE_DUMP_CLOCK
        __builtin_amdgcn_sched_barrier(0);
        const long kernel_start = __builtin_readcyclecounter();
        asm volatile("; [POYENC] kernel start" ::);
        __builtin_amdgcn_sched_barrier(0);
+#endif

        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize());
@@ -441,7 +443,9 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
            (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) /
            KPerBlock);

+#if ENABLE_DUMP_CLOCK
        long loop_start = 0, loop_end = 0;
+#endif
        gridwise_gemm_pipeline.template Run<HasMainKBlockLoop>(a_grid_desc_ak0_m_ak1,
                                                               a_block_desc_ak0_m_ak1,
                                                               a_blockwise_copy,
@@ -456,9 +460,13 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
                                                               b_block_slice_copy_step,
                                                               blockwise_gemm,
                                                               c_thread_buf,
-                                                               num_k_block_main_loop,
+                                                               num_k_block_main_loop
+#if ENABLE_DUMP_CLOCK
+                                                               ,
                                                               loop_start,
-                                                               loop_end);
+                                                               loop_end
+#endif
+        );

        // shuffle C and write out
        {
@@ -656,6 +664,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
                }
            });

+#if ENABLE_DUMP_CLOCK
            __builtin_amdgcn_sched_barrier(0);
            const long kernel_end = __builtin_readcyclecounter();
            asm volatile("; [POYENC] kernel end" ::);
@@ -668,6 +677,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
                       loop_end - loop_start,
                       kernel_end - loop_end);
            }
+#endif
        }
    }
 };

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
@@ -331,10 +331,12 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
        const CElementwiseOperation& c_element_op,
        const Block2CTileMap& block_2_ctile_map)
    {
+#if ENABLE_DUMP_CLOCK
        __builtin_amdgcn_sched_barrier(0);
        const long kernel_start = __builtin_readcyclecounter();
        asm volatile("; [POYENC] kernel start" ::);
        __builtin_amdgcn_sched_barrier(0);
+#endif

        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_a_grid, a_grid_desc_k0_m_k1.GetElementSpaceSize());
@@ -474,7 +476,9 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
        // gridwise GEMM pipeline
        const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(K0 / K0PerBlock);

+#if ENABLE_DUMP_CLOCK
        long loop_start = 0, loop_end = 0;
+#endif
        GridwiseGemmPipe::template Run<HasMainKBlockLoop>(a_grid_desc_k0_m_k1,
                                                          a_block_desc_k0_m_k1,
                                                          a_blockwise_copy,
@@ -489,9 +493,13 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
                                                          b_block_slice_copy_step,
                                                          blockwise_gemm,
                                                          c_thread_buf,
-                                                          num_k_block_main_loop,
+                                                          num_k_block_main_loop
+#if ENABLE_DUMP_CLOCK
+                                                          ,
                                                          loop_start,
-                                                          loop_end);
+                                                          loop_end
+#endif
+        );

        // output: register to global memory
        {
@@ -570,6 +578,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
                              c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2,
                              c_grid_buf);

+#if ENABLE_DUMP_CLOCK
            __builtin_amdgcn_sched_barrier(0);
            const long kernel_end = __builtin_readcyclecounter();
            asm volatile("; [POYENC] kernel end" ::);
@@ -582,6 +591,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
                       loop_end - loop_start,
                       kernel_end - loop_end);
            }
+#endif
        }
    }
 };