"lmdeploy/git@developer.sourcefind.cn:guobj/qwen_lmdeploy.git" did not exist on "8ba2d7c51b22a7c1ee9d641dc1d57a8398d32c67"
Commit 840a617d authored by Wenkai
Browse files

use static kernel

parent 04da3554
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
#include "device_tensor.hpp" #include "device_tensor.hpp"
#include "device_gemm_xdl.hpp" #include "device_gemm_xdl.hpp"
#include "device_gemm_xdl_splitk_c_shuffle.hpp" #include "device_gemm_xdl_splitk_c_shuffle.hpp"
#include "device_gemm_xdl_splitk_c_shuffle_static.hpp"
#include "device_gemm_xdl_cshuffle.hpp" #include "device_gemm_xdl_cshuffle.hpp"
#include "element_wise_operation.hpp" #include "element_wise_operation.hpp"
#include "reference_gemm.hpp" #include "reference_gemm.hpp"
...@@ -44,6 +45,19 @@ using CElementOp = ck::tensor_operation::element_wise::PassThrough; ...@@ -44,6 +45,19 @@ using CElementOp = ck::tensor_operation::element_wise::PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
#if USEING_STATIC_KERNEL
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdlSplitKCShuffleStatic
//######| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//######| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
//<Row, Row, Row, F16, F16, F16, F32, F16, AElementOp, BElementOp, CElementOp, GemmDefault, 3, 256, 16, 128, 32, 8, 2, 16, 16, 1, 2, S<1, 4, 16, 4>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 2, 2, 1, S<1, 8, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 2, 8, 1, 2, S<1, 4, 1, 64>, 2>;
<Row, Row, Row, F16, F16, F16, F32, F16, AElementOp, BElementOp, CElementOp, GemmDefault, 2, 256, 16, 128, 32, 8, 2, 16, 16, 1, 2, S<1, 4, 16, 4>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 2, 2, 1, S<1, 8, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 2, 8, 1, 2, S<1, 4, 1, 64>, 2>;
//<Row, Col, Row, F16, F16, F16, F32, F16, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 16, 128, 128, 8, 8, 16, 16, 1, 2, S<1, 16, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 16, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 2, S<1, 4, 1, 64>, 2>;
//<Row, Row, Row, F16, F16, F16, F32, F16, AElementOp, BElementOp, CElementOp, GemmDefault, 4, 256, 16, 128, 32, 8, 2, 16, 16, 1, 2, S<1, 4, 16, 4>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 2, 2, 1, S<1, 8, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 2, 8, 1, 2, S<1, 4, 1, 64>, 2>;
#else
// clang-format off // clang-format off
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdlSplitKCShuffle using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdlSplitKCShuffle
//######| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| //######| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
...@@ -55,6 +69,8 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdlSplitKCShu ...@@ -55,6 +69,8 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdlSplitKCShu
// clang-format on // clang-format on
#endif
using ReferenceGemmInstance = ck::tensor_operation::host:: using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>; ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
......
...@@ -239,6 +239,79 @@ struct BlockToCTileMap_KSplit_M00_N0_M01Adapt ...@@ -239,6 +239,79 @@ struct BlockToCTileMap_KSplit_M00_N0_M01Adapt
CGridDesc_M_N c_grid_desc_m_n_; CGridDesc_M_N c_grid_desc_m_n_;
}; };
// 2D slices of column-vectors in 3D space
// Compile-time-static variant of BlockToCTileMap_KSplit_M00_N0_M01Adapt:
// the M01 swizzle factor (8) and the k-split count (K_batch macro) are fixed
// at compile time, so the block-id -> C-tile mapping needs no runtime setup.
template <index_t MPerBlock, index_t NPerBlock, typename CGridDesc_M_N>
struct BlockToCTileMap_KSplit_M00_N0_M01Adapt_Static
{
    static constexpr auto I0 = Number<0>{};
    static constexpr auto I1 = Number<1>{};
    static constexpr auto I2 = Number<2>{};
    static constexpr auto I3 = Number<3>{};

    __host__ __device__ BlockToCTileMap_KSplit_M00_N0_M01Adapt_Static() = default;

    __host__ __device__ BlockToCTileMap_KSplit_M00_N0_M01Adapt_Static(
        const CGridDesc_M_N& c_grid_desc_m_n)
        : c_grid_desc_m_n_(c_grid_desc_m_n)
    {
    }

    // Total workgroup count: one block per (M-tile, N-tile, k-split) triple.
    __host__ constexpr index_t CalculateGridSize(const CGridDesc_M_N& c_grid_desc_m_n) const
    {
        const auto M0 = math::integer_divide_ceil(c_grid_desc_m_n.GetLength(I0), MPerBlock);
        const auto N0 = math::integer_divide_ceil(c_grid_desc_m_n.GetLength(I1), NPerBlock);

        const index_t grid_size = M0 * N0 * KSplit_;

        return grid_size;
    }

    // Map a flat 1D block id to a (ksplit, m-tile, n-tile) bottom index.
    template <typename TopIdx>
    __host__ __device__ constexpr auto CalculateBottomIndex(const TopIdx& idx_top) const
    {
        auto block_1d_id = idx_top[I0];

        const auto M0 = math::integer_divide_ceil(c_grid_desc_m_n_.GetLength(I0), MPerBlock);
        const auto N0 = math::integer_divide_ceil(c_grid_desc_m_n_.GetLength(I1), NPerBlock);

        // k-split index is the slowest-varying dimension of the 1D block id;
        // the remainder indexes the (M0 x N0) tile grid.
        const index_t idx_ksplit = block_1d_id / (M0 * N0);
        block_1d_id              = block_1d_id % (M0 * N0);

        index_t idx_N0 = block_1d_id % N0;
        index_t idx_M0 = block_1d_id / N0;

        // Shrink the M01 group for the trailing partial group of M-tiles so the
        // swizzled index never lands outside the tile grid.
        const auto M01_adapt = (idx_M0 < M0 - M0 % M01_) ? M01_ : M0 % M01_;

        index_t idx_M00          = idx_M0 / M01_;
        index_t idx_M01          = idx_M0 % M01_;
        index_t idx_N0_M01_local = idx_N0 + idx_M01 * N0;

        return make_tuple(idx_ksplit,
                          idx_N0_M01_local % M01_adapt + idx_M00 * M01_,
                          idx_N0_M01_local / M01_adapt);
    }

    template <typename CTileIdx, typename CTileDim>
    __host__ __device__ bool ValidCTileIndex(const CTileIdx& /* c_tile_idx */,
                                             const CTileDim& /* c_tile_dim */) const
    {
        return true; // always valid provided that user gets grid size from CalculateGridSize()
    }

    __host__ bool CheckValidity(const CGridDesc_M_N& /* c_grid_desc_m_n */) const { return true; }

    private:
    static constexpr auto M01_ = Number<8>{};
    // NOTE(review): KSplit_ is baked in from the K_batch macro; the kernel must be
    // launched with a matching k-batch value or this mapping is wrong — confirm at
    // the launch site.
    static constexpr auto KSplit_ = Number<K_batch>{};
    CGridDesc_M_N c_grid_desc_m_n_;
};
// Blocks of row-vectors // Blocks of row-vectors
template <index_t MPerBlock, template <index_t MPerBlock,
index_t NPerBlock, index_t NPerBlock,
......
...@@ -139,12 +139,12 @@ struct GridwiseGemmPipeline_v2<2> ...@@ -139,12 +139,12 @@ struct GridwiseGemmPipeline_v2<2>
__host__ __device__ static constexpr bool IsSupported(index_t num_loop) __host__ __device__ static constexpr bool IsSupported(index_t num_loop)
{ {
// TODO: improve applicability // TODO: improve applicability
return num_loop > 2; return num_loop >= 3;
} }
__host__ __device__ static constexpr bool CalculateHasMainLoop(index_t num_loop) __host__ __device__ static constexpr bool CalculateHasMainLoop(index_t num_loop)
{ {
return num_loop > 2; return num_loop >= 5;
} }
template <bool HasMainLoop, template <bool HasMainLoop,
...@@ -179,20 +179,31 @@ struct GridwiseGemmPipeline_v2<2> ...@@ -179,20 +179,31 @@ struct GridwiseGemmPipeline_v2<2>
index_t num_loop) index_t num_loop)
{ {
// global read 0 // global read 0
a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I0); static_for<0, 2, 1>{}([&](auto i_pre){
b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, I0); a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, Number<i_pre>{});
s_nop();
// move to 1 b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, Number<i_pre>{});
s_nop();
// move to i_pre + 1
a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
});
// global read 1
a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I1);
b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, I1);
// Initialize C // Initialize C
c_thread_buf.Clear(); c_thread_buf.Clear();
// LDS write 0
a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, I0);
// global Read 2
a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I0);
// LDS write 0
b_blockwise_copy.RunWrite(b_block_desc, b_block_buf, I0);
// global Read 2
b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, I0);
index_t i = 0; index_t i = 0;
// main body // main body
...@@ -200,20 +211,7 @@ struct GridwiseGemmPipeline_v2<2> ...@@ -200,20 +211,7 @@ struct GridwiseGemmPipeline_v2<2>
{ {
do do
{ {
// move to i + 2 static_for<0, 2, 1>{}([&](auto i_main){
a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
// LDS write i
a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, I0);
// global Read i + 2
a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I0);
// LDS write i
b_blockwise_copy.RunWrite(b_block_desc, b_block_buf, I0);
// global Read i + 2
b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, I0);
block_sync_lds(); block_sync_lds();
// GEMM i // GEMM i
...@@ -226,30 +224,42 @@ struct GridwiseGemmPipeline_v2<2> ...@@ -226,30 +224,42 @@ struct GridwiseGemmPipeline_v2<2>
b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
// LDS write i + 1 // LDS write i + 1
a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, I1); a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, Number<(i_main + 1) % 2>{});
// global read i + 3 // global read i + 3
a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I1); a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, Number<(i_main + 1) % 2>{});
// LDS write i + 1 // LDS write i + 1
b_blockwise_copy.RunWrite(b_block_desc, b_block_buf, I1); b_blockwise_copy.RunWrite(b_block_desc, b_block_buf, Number<(i_main + 1) % 2>{});
// global read i + 3 // global read i + 3
b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, I1); b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, Number<(i_main + 1) % 2>{});
block_sync_lds(); });
// GEMM i + 1
blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
block_sync_lds();
i += 2; i += 2;
} while(i < (num_loop - 2)); } while(i < (num_loop - 4));
} }
// tail // tail
if (i > num_loop - 2) if (i == num_loop - 3)
{ {
block_sync_lds();
// GEMM num_loop - 2
blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
block_sync_lds();
// LDS write num_loop - 1 // LDS write num_loop - 1
a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, I1);
b_blockwise_copy.RunWrite(b_block_desc, b_block_buf, I1);
block_sync_lds();
// GEMM num_loop - 1
blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
block_sync_lds();
a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, I0); a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, I0);
b_blockwise_copy.RunWrite(b_block_desc, b_block_buf, I0); b_blockwise_copy.RunWrite(b_block_desc, b_block_buf, I0);
...@@ -260,27 +270,33 @@ struct GridwiseGemmPipeline_v2<2> ...@@ -260,27 +270,33 @@ struct GridwiseGemmPipeline_v2<2>
} }
// tail // tail
else if (i == num_loop - 2) else if (i == num_loop - 4)
{ {
// Write num_loop - 2 static_for<0, 4, 1>{}([&](auto i_res){
a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, I0);
b_blockwise_copy.RunWrite(b_block_desc, b_block_buf, I0);
block_sync_lds(); block_sync_lds();
// GEMM num_loop - 2 // GEMM num_loop - 2
blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf); blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
if constexpr(i_res < 3)
{
block_sync_lds(); block_sync_lds();
// LDS write num_loop - 1 if constexpr(i_res < 1)
a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, I1); {
b_blockwise_copy.RunWrite(b_block_desc, b_block_buf, I1); a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
}
block_sync_lds(); a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, Number<(i_res + 1) % 2>{});
if constexpr(i_res < 1)
a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I1);
b_blockwise_copy.RunWrite(b_block_desc, b_block_buf, Number<(i_res + 1) % 2>{});
if constexpr(i_res < 1)
b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, I1);
}
});
// GEMM num_loop - 1
blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
} }
} }
...@@ -300,12 +316,12 @@ struct GridwiseGemmPipeline_v2<3> ...@@ -300,12 +316,12 @@ struct GridwiseGemmPipeline_v2<3>
__host__ __device__ static constexpr bool IsSupported(index_t num_loop) __host__ __device__ static constexpr bool IsSupported(index_t num_loop)
{ {
// TODO: improve applicability // TODO: improve applicability
return num_loop > 3; return num_loop >= 4;
} }
__host__ __device__ static constexpr bool CalculateHasMainLoop(index_t num_loop) __host__ __device__ static constexpr bool CalculateHasMainLoop(index_t num_loop)
{ {
return num_loop > 3; return num_loop >= 7;
} }
template <bool HasMainLoop, template <bool HasMainLoop,
...@@ -342,8 +358,13 @@ struct GridwiseGemmPipeline_v2<3> ...@@ -342,8 +358,13 @@ struct GridwiseGemmPipeline_v2<3>
static_for<0, 3, 1>{}([&](auto i_pre){ static_for<0, 3, 1>{}([&](auto i_pre){
// global read i_pre // global read i_pre
a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, Number<i_pre>{}); a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, Number<i_pre>{});
s_nop();
b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, Number<i_pre>{}); b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, Number<i_pre>{});
s_nop();
// move to i_pre + 1 // move to i_pre + 1
a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
...@@ -352,6 +373,16 @@ struct GridwiseGemmPipeline_v2<3> ...@@ -352,6 +373,16 @@ struct GridwiseGemmPipeline_v2<3>
// Initialize C // Initialize C
c_thread_buf.Clear(); c_thread_buf.Clear();
// LDS write i_main
a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, I0);
// global Read i_main + 3
a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I0);
// LDS write i_main
b_blockwise_copy.RunWrite(b_block_desc, b_block_buf, I0);
// global Read i_main + 3
b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, I0);
index_t i = 0; index_t i = 0;
// main body // main body
...@@ -360,85 +391,118 @@ struct GridwiseGemmPipeline_v2<3> ...@@ -360,85 +391,118 @@ struct GridwiseGemmPipeline_v2<3>
do do
{ {
static_for<0, 3, 1>{}([&](auto i_main){ static_for<0, 3, 1>{}([&](auto i_main){
block_sync_lds();
// LDS write i_main // GEMM i_main
a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, Number<i_main>{}); blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
// global Read i_main + 3
a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, Number<i_main>{});
// LDS write i_main block_sync_lds();
b_blockwise_copy.RunWrite(b_block_desc, b_block_buf, Number<i_main>{});
// global Read i_main + 3
b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, Number<i_main>{});
// move to i_main + 3 // move to i_main + 3
a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
block_sync_lds(); // LDS write i_main
a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, Number<(i_main + 1) % 3>{});
// GEMM i_main // global Read i_main + 3
blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf); a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, Number<(i_main + 1) % 3>{});
block_sync_lds(); // LDS write i_main
b_blockwise_copy.RunWrite(b_block_desc, b_block_buf, Number<(i_main + 1) % 3>{});
// global Read i_main + 3
b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, Number<(i_main + 1) % 3>{});
}); });
i += 3; i += 3;
} while(i < (num_loop - 3)); } while(i < (num_loop - 6));
} }
// tail // tail
if (i == num_loop - 3) if (i == num_loop - 6)
{ {
static_for<0, I3, 1>{}([&](auto i_res){ static_for<0, 6, 1>{}([&](auto i_res){
// Write num_loop - 3
a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, Number<i_res>{});
b_blockwise_copy.RunWrite(b_block_desc, b_block_buf, Number<i_res>{});
block_sync_lds(); block_sync_lds();
// GEMM num_loop - 3 // GEMM num_loop - 3
blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf); blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
if constexpr(i_res < 5)
{
block_sync_lds(); block_sync_lds();
if constexpr(i_res < 2)
{
// move to i_res + 3
a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
}
// LDS write i_res
a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, Number<(i_res + 1) % 3>{});
if constexpr(i_res < 2)
a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, Number<(i_res + 1) % 3>{});
// LDS write i_res
b_blockwise_copy.RunWrite(b_block_desc, b_block_buf, Number<(i_res + 1) % 3>{});
if constexpr(i_res < 2)
b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, Number<(i_res + 1) % 3>{});
}
}); });
} }
// tail // tail
else if (i == num_loop - 2) else if (i == num_loop - 5)
{ {
static_for<0, I2, 1>{}([&](auto i_res){ static_for<0, 5, 1>{}([&](auto i_res){
// Write num_loop
a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, Number<i_res>{});
b_blockwise_copy.RunWrite(b_block_desc, b_block_buf, Number<i_res>{});
block_sync_lds(); block_sync_lds();
// GEMM num_loop // GEMM num_loop
blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf); blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
if constexpr(i_res < 4)
{
block_sync_lds(); block_sync_lds();
if constexpr(i_res < 1)
{
// move to i_res + 3
a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
}
// LDS write i_res
a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, Number<(i_res + 1) % 3>{});
if constexpr(i_res < 1)
a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, Number<(i_res + 1) % 3>{});
// LDS write i_res
b_blockwise_copy.RunWrite(b_block_desc, b_block_buf, Number<(i_res + 1) % 3>{});
if constexpr(i_res < 1)
b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, Number<(i_res + 1) % 3>{});
}
}); });
} }
// tail // tail
else if (i == num_loop - 1) else if (i == num_loop - 4)
{ {
static_for<0, I1, 1>{}([&](auto i_res){ static_for<0, 4, 1>{}([&](auto i_res){
// Write num_loop
a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, Number<i_res>{});
b_blockwise_copy.RunWrite(b_block_desc, b_block_buf, Number<i_res>{});
block_sync_lds(); block_sync_lds();
// GEMM num_loop // GEMM num_loop
blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf); blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
if constexpr(i_res < 3)
{
block_sync_lds(); block_sync_lds();
// LDS write i_res
a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, Number<(i_res + 1) % 3>{});
// LDS write i_res
b_blockwise_copy.RunWrite(b_block_desc, b_block_buf, Number<(i_res + 1) % 3>{});
}
}); });
} }
...@@ -501,8 +565,13 @@ struct GridwiseGemmPipeline_v2<4> ...@@ -501,8 +565,13 @@ struct GridwiseGemmPipeline_v2<4>
static_for<0, 4, 1>{}([&](auto i_pre){ static_for<0, 4, 1>{}([&](auto i_pre){
// global read i_pre // global read i_pre
a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, Number<i_pre>{}); a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, Number<i_pre>{});
s_nop();
b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, Number<i_pre>{}); b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, Number<i_pre>{});
s_nop();
// move to i_pre + 1 // move to i_pre + 1
a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
......
...@@ -47,6 +47,11 @@ __global__ void ...@@ -47,6 +47,11 @@ __global__ void
__shared__ FloatAB p_shared_block[shared_block_size]; __shared__ FloatAB p_shared_block[shared_block_size];
//void* kargs_ptr = (&p_a_grid)+0x40;
//if(get_block_1d_id()==1&&get_thread_local_1d_id()==0)
// printf("kargs=0x%p, kargs+64=%d\n", (&p_a_grid), *static_cast<int*>(kargs_ptr));
GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid, GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
p_b_grid, p_b_grid,
p_c_grid, p_c_grid,
...@@ -124,6 +129,9 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 ...@@ -124,6 +129,9 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
static constexpr auto I6 = Number<6>{}; static constexpr auto I6 = Number<6>{};
static constexpr auto I7 = Number<7>{}; static constexpr auto I7 = Number<7>{};
static constexpr auto IM = Number<M_matrix>{};
static constexpr auto IN = Number<N_matrix>{};
// K1 should be Number<...> // K1 should be Number<...>
static constexpr auto AK0 = Number<KPerBlock / AK1Value>{}; static constexpr auto AK0 = Number<KPerBlock / AK1Value>{};
static constexpr auto BK0 = Number<KPerBlock / BK1Value>{}; static constexpr auto BK0 = Number<KPerBlock / BK1Value>{};
...@@ -257,6 +265,18 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 ...@@ -257,6 +265,18 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{})); make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}));
} }
// Build the C-grid descriptor (MBlock, MPerBlock, NBlock, NPerBlock) entirely
// from compile-time constants (IM/IN come from the M_matrix/N_matrix macros),
// ignoring the runtime descriptor argument.
// NOTE(review): MBlock/NBlock use plain integer division, so this silently
// drops a partial tile unless M_matrix % MPerBlock == 0 and
// N_matrix % NPerBlock == 0 — TODO confirm the selected tile sizes divide evenly.
__host__ __device__ static constexpr auto
MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock_Static(const CMNGridDesc& )
{
const auto M = IM;
const auto N = IN;
const auto MBlock = Number<M / MPerBlock>{};
const auto NBlock = Number<N / NPerBlock>{};
// Packed (contiguous, row-major) descriptor over the four static lengths.
return make_naive_tensor_descriptor_packed(make_tuple(MBlock, Number<MPerBlock>{}, NBlock, Number<NPerBlock>{}));
}
// return block_id to C matrix tile idx (m0, n0) mapping // return block_id to C matrix tile idx (m0, n0) mapping
__host__ __device__ static constexpr auto MakeCBlockClusterAdaptor( __host__ __device__ static constexpr auto MakeCBlockClusterAdaptor(
const CMNGridDesc& c_m_n_grid_desc, index_t /* M01 */, index_t /* N01 */, index_t KBatch) const CMNGridDesc& c_m_n_grid_desc, index_t /* M01 */, index_t /* N01 */, index_t KBatch)
...@@ -265,6 +285,14 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 ...@@ -265,6 +285,14 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
c_m_n_grid_desc, 8, KBatch); c_m_n_grid_desc, 8, KBatch);
} }
// return block_id to C matrix tile idx (m0, n0) mapping
// Static counterpart of MakeCBlockClusterAdaptor: the swizzle factor and
// k-split count are compile-time constants inside the tile-map type, so only
// the C grid descriptor is needed.
__host__ __device__ static constexpr auto MakeCBlockClusterAdaptorStatic(
const CMNGridDesc& c_m_n_grid_desc)
{
using StaticCTileMap =
BlockToCTileMap_KSplit_M00_N0_M01Adapt_Static<MPerBlock, NPerBlock, CMNGridDesc>;
return StaticCTileMap(c_m_n_grid_desc);
}
__host__ __device__ static constexpr auto __host__ __device__ static constexpr auto
GetCBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock() GetCBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock()
{ {
...@@ -278,9 +306,15 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 ...@@ -278,9 +306,15 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
Number<CShuffleNRepeatPerShuffle * NWave * NPerXDL>{})); Number<CShuffleNRepeatPerShuffle * NWave * NPerXDL>{}));
} }
#if USEING_STATIC_KERNEL
using CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock =
decltype(MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock_Static(CMNGridDesc{}));
using CBlockClusterAdaptor = decltype(MakeCBlockClusterAdaptorStatic(CMNGridDesc{}));
#else
using CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = using CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock =
decltype(MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(CMNGridDesc{})); decltype(MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(CMNGridDesc{}));
using CBlockClusterAdaptor = decltype(MakeCBlockClusterAdaptor(CMNGridDesc{}, 1, 1, 1)); using CBlockClusterAdaptor = decltype(MakeCBlockClusterAdaptor(CMNGridDesc{}, 1, 1, 1));
#endif
template <bool HasMainKBlockLoop> template <bool HasMainKBlockLoop>
__device__ static void Run(const FloatAB* __restrict__ p_a_grid, __device__ static void Run(const FloatAB* __restrict__ p_a_grid,
......
...@@ -47,3 +47,32 @@ ...@@ -47,3 +47,32 @@
#ifdef CK_USE_AMD_MFMA #ifdef CK_USE_AMD_MFMA
#include "amd_xdlops.hpp" #include "amd_xdlops.hpp"
#endif #endif
// Compile-time switch selecting the fully static GEMM kernel / tile map.
// NOTE(review): "USEING" is a typo for "USING", but this name is referenced by
// other translation units — renaming needs a coordinated change.
#define USEING_STATIC_KERNEL 1
// Problem-shape selectors, named MNKB_<M>_<N>_<K>_<KBatch>. The #if/#elif
// chain below honors the FIRST selector set to 1, so with the three below all
// enabled, only MNKB_16_5120_384_3 actually takes effect.
#define MNKB_16_1152_5120_8 0
#define MNKB_16_5120_384_3 1
#define MNKB_16_1280_5120_8 1
#define MNKB_16_5120_1280_5 1
#if MNKB_16_1152_5120_8
#define M_matrix 16
#define N_matrix 1152
#define K_matrix 5120
#define K_batch 8
#elif MNKB_16_5120_384_3
#define M_matrix 16
#define N_matrix 5120
#define K_matrix 384
// NOTE(review): selector name ends in "_3" but K_batch is 4 — confirm which
// split-K factor is intended.
#define K_batch 4
#elif MNKB_16_1280_5120_8
#define M_matrix 16
#define N_matrix 1280
#define K_matrix 5120
#define K_batch 8
#elif MNKB_16_5120_1280_5
#define M_matrix 16
#define N_matrix 5120
#define K_matrix 1280
#define K_batch 5
#endif
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment