Commit 77e63b34 authored by Bartlomiej Wroblewski

Review: Introduce AK0 and BK0 instead of a single K0

parent 03cd2692
......@@ -24,11 +24,11 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa
// clang-format off
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDpp
// ######| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MDpp| NDpp| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
// ######| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | Dpp| Dpp| PerWave| PerWave| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar|
// ######| | | | | | | | Operation| Operation| Operation| | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector|
// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout, AElementOp, BElementOp, CElementOp, GemmDefault, 128, 64, 64, 8, 8, 32, 8, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>;
// ######| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MDpp| NDpp| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
// ######| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | | Dpp| Dpp| PerWave| PerWave| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar|
// ######| | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector|
// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout, AElementOp, BElementOp, CElementOp, GemmDefault, 128, 64, 64, 64, 8, 2, 32, 8, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 5, 1>;
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::
......
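A quick arithmetic sketch of how the old and new template arguments in the instance above line up (local names are illustrative, not library identifiers; the derivation mirrors the gridwise kernel changes below):

// Old instance: K0PerBlock = 8, K1 = 8  ->  K tile per block = 8 * 8 = 64.
// New instance: KPerBlock = 64, AK1 = 8, BK1 = 2.
constexpr int kPerBlock = 64, aK1 = 8, bK1 = 2;   // copied from the new instance
static_assert(kPerBlock / aK1 == 8,  "AK0PerBlock matches the old K0PerBlock");
static_assert(kPerBlock / bK1 == 32, "BK0PerBlock is now derived independently");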
......@@ -29,7 +29,7 @@ template <index_t BlockSize,
index_t MRepeat,
index_t NRepeat,
index_t KPack>
struct BlockwiseGemmDpp_k0mk1_k0nk1_m0n0m1n1m2n2
struct BlockwiseGemmDpp_ak0mak1_bk0nbk1_m0n0m1n1m2n2
{
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
......@@ -127,7 +127,7 @@ struct BlockwiseGemmDpp_k0mk1_k0nk1_m0n0m1n1m2n2
return make_tuple(c_thread_m, c_thread_n);
}
__host__ __device__ BlockwiseGemmDpp_k0mk1_k0nk1_m0n0m1n1m2n2()
__host__ __device__ BlockwiseGemmDpp_ak0mak1_bk0nbk1_m0n0m1n1m2n2()
{
static_assert(AK0MK1BlockDesc::IsKnownAtCompileTime() &&
BK0NK1BlockDesc::IsKnownAtCompileTime(),
......
......@@ -33,8 +33,9 @@ template <typename ADataType,
ck::index_t BlockSize,
ck::index_t MPerBlock,
ck::index_t NPerBlock,
ck::index_t K0PerBlock,
ck::index_t K1,
ck::index_t KPerBlock,
ck::index_t AK1,
ck::index_t BK1,
ck::index_t MPerDpp,
ck::index_t NPerDpp,
ck::index_t MDppPerWave,
......@@ -67,7 +68,7 @@ struct DeviceGemmDpp : public DeviceGemm<ALayout,
BElementwiseOperation,
CElementwiseOperation>
{
using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_mn_dpp<
using GridwiseGemm = GridwiseGemm_ak0mak1_bk0nbk1_mn_dpp<
BlockSize,
ADataType,
AccDataType,
......@@ -82,10 +83,11 @@ struct DeviceGemmDpp : public DeviceGemm<ALayout,
GemmSpec,
MPerBlock,
NPerBlock,
K0PerBlock,
KPerBlock,
MPerDpp,
NPerDpp,
K1,
AK1,
BK1,
MDppPerWave,
NDppPerWave,
ABlockTransferThreadClusterLengths_K0_M_K1,
......@@ -242,8 +244,9 @@ struct DeviceGemmDpp : public DeviceGemm<ALayout,
<< BlockSize << ", "
<< MPerBlock << ", "
<< NPerBlock << ", "
<< K0PerBlock << ", "
<< K1 << ", "
<< KPerBlock << ", "
<< AK1 << ", "
<< BK1 << ", "
<< MPerDpp << ", "
<< NPerDpp << ", "
<< MDppPerWave << ", "
......
......@@ -31,10 +31,10 @@ __global__ void
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1030__))
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
const auto a_grid_desc_k0_m_k1 = amd_wave_read_first_lane(
GridwiseGemm::MakeAGridDescriptor_K0_M_K1(karg.M, karg.K, karg.K0, karg.StrideA));
const auto b_grid_desc_k0_n_k1 = amd_wave_read_first_lane(
GridwiseGemm::MakeBGridDescriptor_K0_N_K1(karg.K, karg.N, karg.K0, karg.StrideB));
const auto a_grid_desc_ak0_m_ak1 = amd_wave_read_first_lane(
GridwiseGemm::MakeAGridDescriptor_AK0_M_AK1(karg.M, karg.K, karg.AK0, karg.StrideA));
const auto b_grid_desc_bk0_n_bk1 = amd_wave_read_first_lane(
GridwiseGemm::MakeBGridDescriptor_BK0_N_BK1(karg.K, karg.N, karg.BK0, karg.StrideB));
const auto c_grid_desc_m_n = amd_wave_read_first_lane(
GridwiseGemm::MakeCGridDescriptor_M_N(karg.M, karg.N, karg.StrideC));
......@@ -42,8 +42,8 @@ __global__ void
karg.p_b_grid,
karg.p_c_grid,
p_shared,
a_grid_desc_k0_m_k1,
b_grid_desc_k0_n_k1,
a_grid_desc_ak0_m_ak1,
b_grid_desc_bk0_n_bk1,
c_grid_desc_m_n);
#else
ignore = karg;
......@@ -64,10 +64,11 @@ template <index_t BlockSize,
tensor_operation::device::GemmSpecialization GemmSpec,
index_t MPerBlock,
index_t NPerBlock,
index_t K0PerBlock,
index_t KPerBlock,
index_t MPerDpp,
index_t NPerDpp,
index_t K1Value,
index_t AK1Value,
index_t BK1Value,
index_t MDppPerWave,
index_t NDppPerWave,
typename ABlockTransferThreadClusterLengths_K0_M_K1,
......@@ -91,7 +92,7 @@ template <index_t BlockSize,
index_t CThreadTransferDstScalarPerVector,
index_t NumGemmKPrefetchStage = 1,
PipelineVersion PipelineVer = PipelineVersion::v1>
struct GridwiseGemm_k0mk1_k0nk1_mn_dpp
struct GridwiseGemm_ak0mak1_bk0nbk1_mn_dpp
{
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
......@@ -100,7 +101,12 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_dpp
static constexpr auto I4 = Number<4>{};
static constexpr auto I5 = Number<5>{};
static constexpr auto K1 = Number<K1Value>{};
static constexpr auto AK1 = Number<AK1Value>{};
static constexpr auto BK1 = Number<BK1Value>{};
static constexpr auto AK0PerBlock = Number<KPerBlock / AK1Value>{};
static constexpr auto BK0PerBlock = Number<KPerBlock / BK1Value>{};
static constexpr auto max_lds_align = math::lcm(AK1, BK1);
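    // For example, AK1 = 8 and BK1 = 2 give max_lds_align = lcm(8, 2) = 8, so the
    // A and B LDS blocks share one 8-element alignment boundary (illustrative values).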
using ThisThreadBlock = ThisThreadBlock<BlockSize>;
// return block_id to C matrix tile idx (m0, n0) mapping
......@@ -121,7 +127,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_dpp
return math::integer_divide_ceil(N, NPerBlock) * NPerBlock;
}
__host__ static auto CalculateK0(index_t K) { return math::integer_divide_floor(K, K1Value); }
__host__ static auto CalculateAK0(index_t K) { return math::integer_divide_floor(K, AK1Value); }
__host__ static auto CalculateBK0(index_t K) { return math::integer_divide_floor(K, BK1Value); }
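    // Worked example (illustrative values): K = 256 with AK1Value = 8 gives AK0 = 32,
    // while BK1Value = 2 gives BK0 = 128; both factor the same K extent, so
    // AK0 * AK1Value == BK0 * BK1Value == K whenever K is divisible by both K1 values.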
// Argument
struct Problem
......@@ -140,7 +147,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_dpp
StrideC{StrideC_},
MPadded{CalculateMPadded(M_)},
NPadded{CalculateNPadded(N_)},
K0{CalculateK0(K)}
AK0{CalculateAK0(K)},
BK0{CalculateBK0(K)}
{
}
......@@ -155,7 +163,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_dpp
<< "SC:" << StrideC << ", "
<< "MP:" << MPadded << ", "
<< "NP:" << NPadded << ", "
<< "K0:" << K0 << "}" << std::endl;
<< "AK0:" << AK0 << ", "
<< "BK0:" << BK0 << "}" << std::endl;
}
index_t M;
......@@ -166,7 +175,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_dpp
index_t StrideC;
index_t MPadded;
index_t NPadded;
index_t K0;
index_t AK0;
index_t BK0;
};
// Argument
......@@ -196,69 +206,66 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_dpp
using GridwiseGemmPipe = remove_cvref_t<
decltype(GridwiseGemmPipeline_Selector<PipelineVer, NumGemmKPrefetchStage>())>;
__host__ __device__ static constexpr auto GetABlockDescriptor_K0PerBlock_MPerBlock_K1()
__host__ __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
{
constexpr auto max_lds_align = K1;
// A matrix in LDS memory, dst of blockwise copy
constexpr auto a_block_desc_k0_m_k1 = [&]() {
constexpr auto a_block_desc_ak0_m_ak1 = [&]() {
if constexpr(ABlockLdsExtraM)
{
return make_naive_tensor_descriptor(
make_tuple(Number<K0PerBlock>{}, Number<MPerBlock>{}, K1),
make_tuple(Number<MPerBlock + 1>{} * K1, K1, I1));
make_tuple(Number<AK0PerBlock>{}, Number<MPerBlock>{}, AK1),
make_tuple(Number<MPerBlock + 1>{} * AK1, AK1, I1));
}
else
{
return make_naive_tensor_descriptor_aligned(
make_tuple(Number<K0PerBlock>{}, Number<MPerBlock>{}, K1), max_lds_align);
make_tuple(Number<AK0PerBlock>{}, Number<MPerBlock>{}, AK1), max_lds_align);
}
}();
return a_block_desc_k0_m_k1;
return a_block_desc_ak0_m_ak1;
}
__host__ __device__ static constexpr auto GetBBlockDescriptor_K0PerBlock_NPerBlock_K1()
__host__ __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1()
{
constexpr auto max_lds_align = K1;
// B matrix in LDS memory, dst of blockwise copy
constexpr auto b_block_desc_k0_n_k1 = [&]() {
constexpr auto b_block_desc_bk0_n_bk1 = [&]() {
if constexpr(BBlockLdsExtraN)
{
return make_naive_tensor_descriptor(
make_tuple(Number<K0PerBlock>{}, Number<NPerBlock>{}, K1),
make_tuple(Number<NPerBlock + 1>{} * K1, K1, I1));
make_tuple(Number<BK0PerBlock>{}, Number<NPerBlock>{}, BK1),
make_tuple(Number<NPerBlock + 1>{} * BK1, BK1, I1));
}
else
{
return make_naive_tensor_descriptor_aligned(
make_tuple(Number<K0PerBlock>{}, Number<NPerBlock>{}, K1), max_lds_align);
make_tuple(Number<BK0PerBlock>{}, Number<NPerBlock>{}, BK1), max_lds_align);
}
}();
return b_block_desc_k0_n_k1;
return b_block_desc_bk0_n_bk1;
}
__host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
{
// LDS allocation for A and B: be careful of alignment
constexpr auto a_block_desc_k0_m_k1 = GetABlockDescriptor_K0PerBlock_MPerBlock_K1();
constexpr auto b_block_desc_k0_n_k1 = GetBBlockDescriptor_K0PerBlock_NPerBlock_K1();
constexpr auto max_lds_align = K1;
constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1();
constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1();
constexpr auto a_block_space_size_aligned =
math::integer_least_multiple(a_block_desc_k0_m_k1.GetElementSpaceSize(), max_lds_align);
constexpr auto b_block_space_size_aligned =
math::integer_least_multiple(b_block_desc_k0_n_k1.GetElementSpaceSize(), max_lds_align);
constexpr auto a_block_space_size_aligned = math::integer_least_multiple(
a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);
constexpr auto b_block_space_size_aligned = math::integer_least_multiple(
b_block_desc_bk0_n_bk1.GetElementSpaceSize(), max_lds_align);
return (a_block_space_size_aligned + b_block_space_size_aligned) * sizeof(ABDataType);
}
__host__ static constexpr bool CheckValidity(const Problem& problem)
{
static_assert(is_known_at_compile_time<remove_cv_t<decltype(K1)>>::value,
"Wrong! K1 must be known at the time of compilation.");
static_assert(is_known_at_compile_time<remove_cv_t<decltype(AK1)>>::value,
"Wrong! AK1 must be known at the time of compilation.");
static_assert(is_known_at_compile_time<remove_cv_t<decltype(BK1)>>::value,
"Wrong! BK1 must be known at the time of compilation.");
static_assert(
MPerBlock % (MPerDpp * MDppPerWave) == 0,
......@@ -267,6 +274,18 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_dpp
NPerBlock % (NPerDpp * NDppPerWave) == 0,
"Invalid tuning parameters! NPerBlock must be divisible by NPerDpp * NDppPerWave.");
static_assert(
KPerBlock % AK1Value == 0 && KPerBlock % BK1Value == 0,
"Invalid tuning parameters! KPerBlock must be divisible by both AK1 and BK1.");
static_assert(AK1Value % ABlockTransferDstScalarPerVector_K1 == 0,
"Invalid tuning parameters! AK1Value must be divisible by "
"ABlockTransferDstScalarPerVector_K1");
static_assert(BK1Value % BBlockTransferDstScalarPerVector_K1 == 0,
"Invalid tuning parameters! BK1Value must be divisible by "
"BBlockTransferDstScalarPerVector_K1");
if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::MPadding ||
GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding ||
......@@ -319,10 +338,13 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_dpp
}
}
// check gridwise gemm pipeline
const index_t K0 = problem.K / K1;
const auto num_k_loop = K0 / K0PerBlock;
if(problem.K % KPerBlock != 0)
{
return false;
}
// check gridwise gemm pipeline
const auto num_k_loop = problem.K / KPerBlock;
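        // e.g. K = 256 and KPerBlock = 64 give num_k_loop = 4; the removed path computed
        // the same count as (K / K1) / K0PerBlock = (256 / 8) / 8 = 4 (illustrative values).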
if(!GridwiseGemmPipe::IsSupported(num_k_loop))
{
return false;
......@@ -333,7 +355,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_dpp
__host__ static constexpr bool CalculateHasMainKBlockLoop(index_t K)
{
const index_t num_loop = K / (K0PerBlock * K1);
const auto num_loop = K / KPerBlock;
return GridwiseGemmPipe::CalculateHasMainLoop(num_loop);
}
......@@ -342,30 +364,33 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_dpp
__host__ __device__ static constexpr auto
MakeCGridDescriptor_M0_N0_M1_N1_M2_N2(const CGridDesc& c_grid_desc_m_n)
{
constexpr auto a_block_desc_k0_m_k1 = GetABlockDescriptor_K0PerBlock_MPerBlock_K1();
constexpr auto b_block_desc_k0_n_k1 = GetBBlockDescriptor_K0PerBlock_NPerBlock_K1();
constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1();
constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1();
constexpr index_t KPack = math::max(
math::lcm(AK1, BK1), DppSelector<ABDataType, MPerDpp, NPerDpp>::selected_dpp.k_per_dpp);
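        // e.g. AK1 = 8 and BK1 = 2 with a selected DPP whose k_per_dpp = 2 yield
        // KPack = max(lcm(8, 2), 2) = 8 (illustrative; k_per_dpp comes from DppSelector).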
using BlockwiseGemm =
BlockwiseGemmDpp_k0mk1_k0nk1_m0n0m1n1m2n2<BlockSize,
ABDataType,
AccDataType,
decltype(a_block_desc_k0_m_k1),
decltype(b_block_desc_k0_n_k1),
MPerDpp,
NPerDpp,
MDppPerWave,
NDppPerWave,
K1>;
BlockwiseGemmDpp_ak0mak1_bk0nbk1_m0n0m1n1m2n2<BlockSize,
ABDataType,
AccDataType,
decltype(a_block_desc_ak0_m_ak1),
decltype(b_block_desc_bk0_n_bk1),
MPerDpp,
NPerDpp,
MDppPerWave,
NDppPerWave,
KPack>;
return BlockwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_N2(c_grid_desc_m_n);
}
static constexpr auto matrix_padder =
ck::tensor_operation::device::MatrixPadder<GemmSpec, index_t, index_t, index_t>{
MPerBlock, NPerBlock, K0PerBlock* K1};
MPerBlock, NPerBlock, KPerBlock};
__device__ static auto
MakeAGridDescriptor_K0_M_K1(index_t M, index_t K, index_t K0, index_t StrideA)
MakeAGridDescriptor_AK0_M_AK1(index_t M, index_t K, index_t AK0, index_t StrideA)
{
const auto a_grid_desc_mraw_kraw = [&]() {
if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
......@@ -381,14 +406,14 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_dpp
const auto a_grid_desc_m_k = matrix_padder.PadADescriptor_M_K(a_grid_desc_mraw_kraw);
return transform_tensor_descriptor(
a_grid_desc_m_k,
make_tuple(make_unmerge_transform(make_tuple(K0, K1Value)),
make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)),
make_pass_through_transform(M)),
make_tuple(Sequence<1>{}, Sequence<0>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
}
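    // Index-math sketch for the unmerge above (not library code): a flat k splits as
    //   ak0 = k / AK1Value;  ak1 = k % AK1Value;
    // so the padded A[m][k] element is addressed as a[ak0][m][ak1] in the resulting
    // AK0 x M x AK1 descriptor.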
__device__ static auto
MakeBGridDescriptor_K0_N_K1(index_t K, index_t N, index_t K0, index_t StrideB)
MakeBGridDescriptor_BK0_N_BK1(index_t K, index_t N, index_t BK0, index_t StrideB)
{
const auto b_grid_desc_nraw_kraw = [&]() {
if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
......@@ -405,7 +430,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_dpp
return transform_tensor_descriptor(
b_grid_desc_n_k,
make_tuple(make_pass_through_transform(N),
make_unmerge_transform(make_tuple(K0, K1Value))),
make_unmerge_transform(make_tuple(BK0, BK1Value))),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<1>{}, Sequence<0, 2>{}));
}
......@@ -427,24 +452,24 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_dpp
}
template <bool HasMainKBlockLoop,
typename AGridDesc_K0_M_K1,
typename BGridDesc_K0_N_K1,
typename AGridDesc_AK0_M_AK1,
typename BGridDesc_BK0_N_BK1,
typename CGridDesc_M_N>
__device__ static void Run(const ABDataType* __restrict__ p_a_grid,
const ABDataType* __restrict__ p_b_grid,
CDataType* __restrict__ p_c_grid,
void* __restrict__ p_shared,
const AGridDesc_K0_M_K1& a_grid_desc_k0_m_k1,
const BGridDesc_K0_N_K1& b_grid_desc_k0_n_k1,
const AGridDesc_AK0_M_AK1& a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_BK1& b_grid_desc_bk0_n_bk1,
const CGridDesc_M_N& c_grid_desc_m_n)
{
const auto c_grid_desc_m0_n0_m1_n1_m2_n2 =
MakeCGridDescriptor_M0_N0_M1_N1_M2_N2(c_grid_desc_m_n);
const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_a_grid, a_grid_desc_k0_m_k1.GetElementSpaceSize());
p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize());
const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_b_grid, b_grid_desc_k0_n_k1.GetElementSpaceSize());
p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize());
auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_c_grid, c_grid_desc_m0_n0_m1_n1_m2_n2.GetElementSpaceSize());
......@@ -473,25 +498,23 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_dpp
const index_t n_block_data_idx_on_grid =
__builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock);
constexpr auto max_lds_align = K1;
// A matrix in LDS memory, dst of blockwise copy
constexpr auto a_block_desc_k0_m_k1 = GetABlockDescriptor_K0PerBlock_MPerBlock_K1();
constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1();
// B matrix in LDS memory, dst of blockwise copy
constexpr auto b_block_desc_k0_n_k1 = GetBBlockDescriptor_K0PerBlock_NPerBlock_K1();
constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1();
auto a_blockwise_copy =
ThreadGroupTensorSliceTransfer_v4r1<ThisThreadBlock,
AElementwiseOperation,
ck::tensor_operation::element_wise::PassThrough,
InMemoryDataOperationEnum::Set,
Sequence<K0PerBlock, MPerBlock, K1>,
Sequence<AK0PerBlock, MPerBlock, AK1>,
ABlockTransferThreadClusterLengths_K0_M_K1,
ABlockTransferThreadClusterArrangeOrder,
ABDataType,
ABDataType,
decltype(a_grid_desc_k0_m_k1),
decltype(a_block_desc_k0_m_k1),
decltype(a_grid_desc_ak0_m_ak1),
decltype(a_block_desc_ak0_m_ak1),
ABlockTransferSrcAccessOrder,
Sequence<1, 0, 2>,
ABlockTransferSrcVectorDim,
......@@ -503,10 +526,10 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_dpp
AThreadTransferSrcResetCoordinateAfterRun,
true,
NumGemmKPrefetchStage>(
a_grid_desc_k0_m_k1,
a_grid_desc_ak0_m_ak1,
make_multi_index(0, m_block_data_idx_on_grid, 0),
a_element_op,
a_block_desc_k0_m_k1,
a_block_desc_ak0_m_ak1,
make_multi_index(0, 0, 0),
ck::tensor_operation::element_wise::PassThrough{});
......@@ -515,13 +538,13 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_dpp
BElementwiseOperation,
ck::tensor_operation::element_wise::PassThrough,
InMemoryDataOperationEnum::Set,
Sequence<K0PerBlock, NPerBlock, K1>,
Sequence<BK0PerBlock, NPerBlock, BK1>,
BBlockTransferThreadClusterLengths_K0_N_K1,
BBlockTransferThreadClusterArrangeOrder,
ABDataType,
ABDataType,
decltype(b_grid_desc_k0_n_k1),
decltype(b_block_desc_k0_n_k1),
decltype(b_grid_desc_bk0_n_bk1),
decltype(b_block_desc_bk0_n_bk1),
BBlockTransferSrcAccessOrder,
Sequence<1, 0, 2>,
BBlockTransferSrcVectorDim,
......@@ -533,59 +556,62 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_dpp
BThreadTransferSrcResetCoordinateAfterRun,
true,
NumGemmKPrefetchStage>(
b_grid_desc_k0_n_k1,
b_grid_desc_bk0_n_bk1,
make_multi_index(0, n_block_data_idx_on_grid, 0),
b_element_op,
b_block_desc_k0_n_k1,
b_block_desc_bk0_n_bk1,
make_multi_index(0, 0, 0),
ck::tensor_operation::element_wise::PassThrough{});
// GEMM definition
// c_mtx += transpose(a_mtx) * b_mtx
// a_mtx[K0PerBlock, MPerBlock] is in LDS
// b_mtx[K0PerBlock, NPerBlock] is in LDS
// a_mtx[AK0PerBlock, MPerBlock] is in LDS
// b_mtx[BK0PerBlock, NPerBlock] is in LDS
// c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
// register
constexpr index_t KPack = math::max(
math::lcm(AK1, BK1), DppSelector<ABDataType, MPerDpp, NPerDpp>::selected_dpp.k_per_dpp);
auto blockwise_gemm =
BlockwiseGemmDpp_k0mk1_k0nk1_m0n0m1n1m2n2<BlockSize,
ABDataType,
AccDataType,
decltype(a_block_desc_k0_m_k1),
decltype(b_block_desc_k0_n_k1),
MPerDpp,
NPerDpp,
MDppPerWave,
NDppPerWave,
K1>();
BlockwiseGemmDpp_ak0mak1_bk0nbk1_m0n0m1n1m2n2<BlockSize,
ABDataType,
AccDataType,
decltype(a_block_desc_ak0_m_ak1),
decltype(b_block_desc_bk0_n_bk1),
MPerDpp,
NPerDpp,
MDppPerWave,
NDppPerWave,
KPack>();
auto c_thread_buf = blockwise_gemm.GetCThreadBuffer();
// LDS allocation for A and B: be careful of alignment
constexpr auto a_block_space_size_aligned =
math::integer_least_multiple(a_block_desc_k0_m_k1.GetElementSpaceSize(), max_lds_align);
constexpr auto a_block_space_size_aligned = math::integer_least_multiple(
a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);
auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
static_cast<ABDataType*>(p_shared), a_block_desc_k0_m_k1.GetElementSpaceSize());
static_cast<ABDataType*>(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
static_cast<ABDataType*>(p_shared) + a_block_space_size_aligned,
b_block_desc_k0_n_k1.GetElementSpaceSize());
b_block_desc_bk0_n_bk1.GetElementSpaceSize());
constexpr auto a_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0);
constexpr auto b_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0);
constexpr auto a_block_slice_copy_step = make_multi_index(AK0PerBlock, 0, 0);
constexpr auto b_block_slice_copy_step = make_multi_index(BK0PerBlock, 0, 0);
// gridwise GEMM pipeline
const auto K0 = a_grid_desc_k0_m_k1.GetLength(I0);
const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(K0 / K0PerBlock);
const auto AK0 = a_grid_desc_ak0_m_ak1.GetLength(I0);
// (AK0 / AK0PerBlock) is always equal to (BK0 / BK0PerBlock)
const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(AK0 / AK0PerBlock);
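        // Why the two ratios agree: AK0 = K / AK1 and AK0PerBlock = KPerBlock / AK1,
        // so AK0 / AK0PerBlock == K / KPerBlock, and the BK0 side reduces identically.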
GridwiseGemmPipe::template Run<HasMainKBlockLoop>(a_grid_desc_k0_m_k1,
a_block_desc_k0_m_k1,
GridwiseGemmPipe::template Run<HasMainKBlockLoop>(a_grid_desc_ak0_m_ak1,
a_block_desc_ak0_m_ak1,
a_blockwise_copy,
a_grid_buf,
a_block_buf,
a_block_slice_copy_step,
b_grid_desc_k0_n_k1,
b_block_desc_k0_n_k1,
b_grid_desc_bk0_n_bk1,
b_block_desc_bk0_n_bk1,
b_blockwise_copy,
b_grid_buf,
b_block_buf,
......
......@@ -30,17 +30,17 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa
// Compilation parameters for a[k, m] * b[k, n] = c[m, n]
// clang-format off
using device_gemm_dpp_f16_f16_f16_km_kn_mn_instances = std::tuple<
// ########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MDpp| NDpp| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
// ########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | Dpp| Dpp| PerWave| PerWave| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar|
// ########| | | | | | | | Operation| Operation| Operation| | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector|
// ########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceGemmDpp< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 8, 8, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 8, 8, 32, 8, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 64, 8, 8, 32, 8, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 32, 32, 4, 8, 32, 8, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 64, 8, 8, 32, 8, 2, 4, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 32, 32, 4, 8, 32, 8, 1, 4, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 16, 16, 2, 8, 16, 16, 1, 1, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, true, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, true, 5, 1>
// ########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MDpp| NDpp| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
// ########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | | Dpp| Dpp| PerWave| PerWave| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar|
// ########| | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector|
// ########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceGemmDpp< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 64, 4, 4, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 64, 4, 4, 32, 8, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 64, 64, 4, 4, 32, 8, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 32, 32, 32, 4, 4, 32, 8, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 64, 64, 4, 4, 32, 8, 2, 4, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 32, 32, 32, 4, 4, 32, 8, 1, 4, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 16, 16, 16, 4, 4, 16, 16, 1, 1, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, 5, 1>
>;
// clang-format on
......
......@@ -30,17 +30,17 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa
// Compilation parameters for a[k, m] * b[n, k] = c[m, n]
// clang-format off
using device_gemm_dpp_f16_f16_f16_km_nk_mn_instances = std::tuple<
// ########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MDpp| NDpp| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
// ########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | Dpp| Dpp| PerWave| PerWave| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar|
// ########| | | | | | | | Operation| Operation| Operation| | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector|
// ########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceGemmDpp< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 8, 8, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 8, 8, 32, 8, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 64, 8, 8, 32, 8, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 32, 32, 4, 8, 32, 8, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 64, 8, 8, 32, 8, 2, 4, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 32, 32, 4, 8, 32, 8, 1, 4, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 16, 16, 2, 8, 16, 16, 1, 1, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, true, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>
// ########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MDpp| NDpp| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
// ########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | | Dpp| Dpp| PerWave| PerWave| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar|
// ########| | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector|
// ########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceGemmDpp< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 64, 4, 8, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 64, 4, 8, 32, 8, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 64, 64, 4, 8, 32, 8, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 32, 32, 32, 4, 8, 32, 8, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 64, 64, 4, 8, 32, 8, 2, 4, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 32, 32, 32, 4, 8, 32, 8, 1, 4, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, S<4, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 16, 16, 16, 4, 8, 16, 16, 1, 1, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>
>;
// clang-format on
......
......@@ -29,17 +29,17 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa
// Compilation parameters for a[m, k] * b[k, n] = c[m, n]
// clang-format off
using device_gemm_dpp_f16_f16_f16_mk_kn_mn_instances = std::tuple<
// ########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MDpp| NDpp| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
// ########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | Dpp| Dpp| PerWave| PerWave| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar|
// ########| | | | | | | | Operation| Operation| Operation| | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector|
// ########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceGemmDpp< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 8, 8, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 8, 8, 32, 8, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 64, 8, 8, 32, 8, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 32, 32, 4, 8, 32, 8, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 64, 8, 8, 32, 8, 2, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 32, 32, 4, 8, 32, 8, 1, 4, S<4, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 16, 16, 2, 8, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, true, 5, 1>
// ########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MDpp| NDpp| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
// ########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | | Dpp| Dpp| PerWave| PerWave| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar|
// ########| | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector|
// ########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceGemmDpp< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 64, 8, 4, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 64, 8, 4, 32, 8, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 64, 64, 8, 4, 32, 8, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 32, 32, 32, 8, 4, 32, 8, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 64, 64, 8, 4, 32, 8, 2, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 32, 32, 32, 8, 4, 32, 8, 1, 4, S<4, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 16, 16, 16, 8, 4, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, 5, 1>
>;
// clang-format on
......
......@@ -30,17 +30,17 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa
// Compilation parameters for a[m, k] * b[n, k] = c[m, n]
// clang-format off
using device_gemm_dpp_f16_f16_f16_mk_nk_mn_instances = std::tuple<
// ########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MDpp| NDpp| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
// ########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | Dpp| Dpp| PerWave| PerWave| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar|
// ########| | | | | | | | Operation| Operation| Operation| | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector|
// ########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceGemmDpp< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 8, 8, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 8, 8, 32, 8, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 64, 8, 8, 32, 8, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 32, 32, 4, 8, 32, 8, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 64, 8, 8, 32, 8, 2, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 32, 32, 4, 8, 32, 8, 1, 4, S<4, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 16, 16, 2, 8, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>
// ########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MDpp| NDpp| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
// ########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | | Dpp| Dpp| PerWave| PerWave| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar|
// ########| | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector|
// ########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceGemmDpp< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 64, 8, 8, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 64, 8, 8, 32, 8, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 64, 64, 8, 8, 32, 8, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 32, 32, 32, 8, 8, 32, 8, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 64, 64, 8, 8, 32, 8, 2, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 32, 32, 32, 8, 8, 32, 8, 1, 4, S<4, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 8, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>,
DeviceGemmDpp< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 32, 16, 16, 16, 8, 8, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 5, 1>
>;
// clang-format on
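A pattern worth noting across the four updated instance files: the larger K1 value (8) appears to sit on the operand whose layout keeps K contiguous (row-major A in the mk_* files, column-major B in the *_nk files), permitting 8-wide vector loads along K, while the strided operand drops to AK1 or BK1 = 4. This per-operand tuning is exactly what splitting the single K1 into AK1 and BK1 enables.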
......