Commit 0381f287 authored by Chao Liu

clean up

parent b8d37c10
#pragma once

#include "common_header.hpp"

namespace ck {

struct GridwiseGemmPipeline_v2
{
    __host__ __device__ static constexpr bool IsSupported(index_t num_loop)
    {
        // TODO: improve applicability
        return num_loop % 2 == 0;
    }

    __host__ __device__ static constexpr bool CalculateHasMainLoop(index_t num_loop)
    {
        return (num_loop / 2) > 1;
    }

    template <bool HasMainLoop,
              typename AGridDesc,
              typename ABlockDesc,
              typename ABlockTransfer,
              typename AGridBuffer,
              typename ABlockBuffer,
              typename ABlockTransferStep,
              typename BGridDesc,
              typename BBlockDesc,
              typename BBlockTransfer,
              typename BGridBuffer,
              typename BBlockBuffer,
              typename BBlockTransferStep,
              typename BlockwiseGemm,
              typename CThreadBuffer>
    __device__ static void Run(const AGridDesc& a_grid_desc,
                               const ABlockDesc& a_block_desc,
                               ABlockTransfer& a_blockwise_copy,
                               const AGridBuffer& a_grid_buf,
                               ABlockBuffer& a_block_buf,
                               const ABlockTransferStep& a_block_copy_step,
                               const BGridDesc& b_grid_desc,
                               const BBlockDesc& b_block_desc,
                               BBlockTransfer& b_blockwise_copy,
                               const BGridBuffer& b_grid_buf,
                               BBlockBuffer& b_block_buf,
                               const BBlockTransferStep& b_block_copy_step,
                               const BlockwiseGemm& blockwise_gemm,
                               CThreadBuffer& c_thread_buf,
                               index_t num_loop)
    {
        // global read 0
        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
        b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);

        // move to 1
        a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
        b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);

        // Initialize C
        c_thread_buf.Clear();

        // LDS write 0
        a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
        // global read 1
        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);

        // LDS write 0
        b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
        // global read 1
        b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);

        // main body
        if constexpr(HasMainLoop)
        {
            index_t i = 0;

            do
            {
                block_sync_lds();

                // GEMM i
                blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);

                block_sync_lds();

                // move to i + 2
                a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
                b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);

                // LDS write i + 1
                a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
                // global read i + 2
                a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);

                // LDS write i + 1
                b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
                // global read i + 2
                b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);

                ++i;
            } while(i < (num_loop - 2));
        }

        // tail
        {
            block_sync_lds();

            // GEMM num_loop - 2
            blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);

            block_sync_lds();

            // LDS write num_loop - 1
            a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
            b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);

            block_sync_lds();

            // GEMM num_loop - 1
            blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
        }
    }
};

} // namespace ck
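
The Run() above keeps a prefetch distance of two K-blocks: during GEMM i the data for block i + 1 has already been fetched by the blockwise copies, and right after the GEMM block i + 1 is written to LDS and the global read of block i + 2 is issued so it can overlap with the next GEMM. The standalone host program below is a sketch that replays that schedule with plain integers, so the i + 1 / i + 2 bookkeeping and the two-GEMM tail can be checked for a small num_loop; replay_schedule and everything else in it are illustrative stand-ins, not code from this commit.

#include <cstdio>

// replay_schedule is a hypothetical helper, not a CK function.
static void replay_schedule(int num_loop)
{
    // mirrors IsSupported / CalculateHasMainLoop above
    if(num_loop % 2 != 0)
    {
        std::printf("num_loop = %d is rejected by IsSupported()\n", num_loop);
        return;
    }
    const bool has_main_loop = (num_loop / 2) > 1;

    // prologue: stage the first two K-blocks
    std::printf("global read 0, move window to 1\n");
    std::printf("LDS write 0, global read 1\n");

    if(has_main_loop)
    {
        int i = 0;
        do
        {
            // same per-iteration pattern as the main body of Run()
            std::printf("GEMM %d, LDS write %d, global read %d\n", i, i + 1, i + 2);
            ++i;
        } while(i < num_loop - 2);
    }

    // tail: the last two K-blocks were already fetched, so no more global reads
    std::printf("GEMM %d, LDS write %d\n", num_loop - 2, num_loop - 1);
    std::printf("GEMM %d\n", num_loop - 1);
}

int main() { replay_schedule(6); }

For num_loop = 6 the output shows four main-loop iterations followed by the two-GEMM tail, matching the stage comments in Run().
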
@@ -8,7 +8,6 @@
#include "thread_group_tensor_slice_transfer_v6r1.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
#include "gridwise_gemm_pipeline_v1.hpp"
#include "gridwise_gemm_pipeline_v2.hpp"
namespace ck {
@@ -128,11 +127,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
using ThisThreadBlock = ThisThreadBlock<BlockSize>;
#if 1
using GridwiseGemmPipe = GridwiseGemmPipeline_v1<NumGemmKPrefetchStage>;
#else
using GridwiseGemmPipe = GridwiseGemmPipeline_v2;
#endif
__host__ __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
{
......
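
The #if 1 block above keeps GridwiseGemmPipeline_v1 as the default and leaves GridwiseGemmPipeline_v2 as the alternative. Since the v2 pipeline's Run() takes HasMainLoop as a compile-time bool, a caller has to turn the run-time result of CalculateHasMainLoop(num_loop) into one of two instantiations. The sketch below shows only that dispatch pattern; Pipeline and launch are hypothetical stand-ins, and the real Run() receives descriptors, copies, and buffers rather than a bare num_loop.

#include <cstdio>

// Pipeline is a stand-in with the same static shape as the GridwiseGemmPipe alias above.
struct Pipeline
{
    static constexpr bool CalculateHasMainLoop(int num_loop) { return (num_loop / 2) > 1; }

    template <bool HasMainLoop>
    static void Run(int num_loop)
    {
        if constexpr(HasMainLoop)
            std::printf("main loop + tail, num_loop = %d\n", num_loop);
        else
            std::printf("tail only, num_loop = %d\n", num_loop);
    }
};

// launch() plays the role of the kernel entry point that owns num_loop.
void launch(int num_loop)
{
    if(Pipeline::CalculateHasMainLoop(num_loop))
        Pipeline::Run<true>(num_loop);
    else
        Pipeline::Run<false>(num_loop);
}

int main()
{
    launch(2); // tail only
    launch(8); // main loop + tail
}
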
@@ -264,7 +264,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4
}();
using BlockwiseGemm =
BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<ThisThreadBlock,
BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
FloatAB,
FloatAcc,
decltype(a_k0_m_k1_block_desc),
@@ -452,7 +452,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4
// B matrix blockwise copy
auto b_blockwise_copy =
ThreadGroupTensorSliceTransfer_v4r1<ThisThreadGroup,
ThreadGroupTensorSliceTransfer_v4r1<ThisThreadBlock,
BElementwiseOperation,
ck::tensor_operation::element_wise::PassThrough,
InMemoryDataOperationEnum::Set,
@@ -489,7 +489,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4
// sanity check
auto blockwise_gemm =
BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<ThisThreadBlock,
BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
FloatAB,
FloatAcc,
decltype(a_k0_m_k1_block_desc),
......
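
The changed lines in this file swap between a bare BlockSize and the ThisThreadBlock alias (an earlier hunk shows it defined as ThisThreadBlock<BlockSize>) in the leading template argument. The sketch below shows the minimal shape such a thread-group type is assumed to have, a compile-time thread count plus a thread-id query; the GetNumOfThread/GetThreadId member names are inferred from context, not copied from this diff.

#include <cstdio>

// Hypothetical sketch of a thread-group type in the spirit of ThisThreadBlock<BlockSize>.
template <unsigned BlockSize>
struct ThreadGroupSketch
{
    // thread count is a compile-time property of the group
    static constexpr unsigned GetNumOfThread() { return BlockSize; }

    // on the device this would wrap the hardware thread id (e.g. threadIdx.x);
    // a fixed value keeps the sketch host-compilable
    static unsigned GetThreadId() { return 0; }
};

int main()
{
    using Group = ThreadGroupSketch<256>;
    std::printf("threads per group = %u, this thread = %u\n",
                Group::GetNumOfThread(), Group::GetThreadId());
}

Passing a type like this rather than a raw index_t presumably keeps a transfer reusable by thread groups other than a full workgroup, which would explain why ThreadGroupTensorSliceTransfer_v4r1 takes ThisThreadBlock here while the blockwise GEMM keeps a plain BlockSize parameter.
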
#ifndef CK_GRIDWISE_GEMM_XDLOPS_V3R1_HPP
#define CK_GRIDWISE_GEMM_XDLOPS_V3R1_HPP
#pragma once
#include "common_header.hpp"
#include "multi_index_transform_helper.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "blockwise_gemm_xdlops.hpp"
#include "threadwise_tensor_slice_transfer_v4r1.hpp"
#include "threadwise_tensor_slice_transfer_v6r1.hpp"
#include "thread_group_tensor_slice_transfer_v4r1.hpp"
#include "thread_group_tensor_slice_transfer_v6r1.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
#include "gridwise_gemm_pipeline_v1.hpp"
#include "tensor_space_filling_curve.hpp"
@@ -476,7 +474,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
math::lcm(AK1, BK1), MfmaSelector<FloatAB, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
auto blockwise_gemm =
BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<ThisThreadBlock,
BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
FloatAB,
FloatAcc,
decltype(a_block_desc_ak0_m_ak1),
@@ -647,7 +645,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
// LDS to global
auto c_block_copy_lds_to_global = ThreadGroupTensorSliceTransfer_v6r1<
BlockSize, // index_t BlockSize,
ThisThreadBlock, // ThreadGroup
CElementwiseOperation, // ElementwiseOperation,
CGlobalMemoryDataOperation, // DstInMemOp,
Sequence<1,
@@ -748,4 +746,3 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
};
} // namespace ck
#endif