Unverified commit b8d11559, authored by amd-khushbu and committed by GitHub

Merge branch 'develop' into ck_profiler_m_instances

parents 7f3fe4e7 3b230208
@@ -361,10 +361,18 @@ struct GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle
const auto M = d0_grid_desc_m_n.GetLength(I0);
const auto N = d0_grid_desc_m_n.GetLength(I1);
constexpr auto mfma =
MfmaSelector<A0B0B1DataType, Gemm0MPerXdl, Gemm0NPerXdl>::selected_mfma;
constexpr auto N3 = mfma.num_groups_per_blk;
constexpr auto N5 = mfma.group_size;
constexpr bool is_single_rate_mfma =
((is_same<A0B0B1DataType, half_t>::value || is_same<A0B0B1DataType, bhalf_t>::value) &&
math::lcm(A0K1, B0K1) <= 4)
? true
: false;
constexpr auto mfma = MfmaSelector<A0B0B1DataType,
Gemm0MPerXdl,
Gemm0NPerXdl,
A0B0B1DataType,
is_single_rate_mfma>::selected_mfma;
constexpr auto N3 = mfma.num_groups_per_blk;
constexpr auto N5 = mfma.group_size;
return transform_tensor_descriptor(
d0_grid_desc_m_n,
make_tuple(make_unmerge_transform(make_tuple(
@@ -643,9 +651,19 @@ struct GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle
// acc1[m][o] += acc[m][n] * B1[n][o]
// sanity check
constexpr index_t KPack = math::max(
math::lcm(A0K1, B0K1),
MfmaSelector<A0B0B1DataType, Gemm0MPerXdl, Gemm0NPerXdl>::selected_mfma.k_per_blk);
constexpr auto lcm_A0K1_B0K1 = math::lcm(A0K1, B0K1);
constexpr bool is_single_rate_mfma =
((is_same<A0B0B1DataType, half_t>::value || is_same<A0B0B1DataType, bhalf_t>::value) &&
lcm_A0K1_B0K1 <= 4)
? true
: false;
constexpr index_t KPack =
math::max(lcm_A0K1_B0K1,
MfmaSelector<A0B0B1DataType,
Gemm0MPerXdl,
Gemm0NPerXdl,
A0B0B1DataType,
is_single_rate_mfma>::selected_mfma.k_per_blk);
auto blockwise_gemm0 = BlockwiseGemmXdlops_v2<
BlockSize,
......
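This guard repeats in this and every following hunk: with an fp16/bf16 compute type, a double-rate MFMA consumes K in wider per-lane groups than its single-rate counterpart, so when lcm(AK1, BK1) <= 4 the selector is pinned to the single-rate instruction and KPack stays within the loaded vector width. A minimal host-side sketch of that selection follows; `choose_k_pack`, the stand-in `half_t`/`bhalf_t` types, and the `k_per_blk` values 4/8 are illustrative assumptions, not CK's API:

```cpp
#include <algorithm>
#include <numeric>
#include <type_traits>

// Stand-ins for the device types; the real kernels use ck's half_t / bhalf_t.
struct half_t;
struct bhalf_t;

// Illustrative only: a double-rate fp16/bf16 MFMA consumes twice the K
// elements per lane of its single-rate counterpart.
template <bool IsSingleRate>
constexpr int k_per_blk = IsSingleRate ? 4 : 8;

// Hypothetical helper mirroring the recurring KPack computation.
template <typename ComputeType, int AK1, int BK1>
constexpr int choose_k_pack()
{
    constexpr int lcm_ak1_bk1 = std::lcm(AK1, BK1);
    // fp16/bf16 tiles whose K-contiguous vectors hold at most 4 elements
    // cannot feed a double-rate MFMA, so force the single-rate instruction.
    constexpr bool is_single_rate_mfma =
        (std::is_same_v<ComputeType, half_t> || std::is_same_v<ComputeType, bhalf_t>) &&
        lcm_ak1_bk1 <= 4;
    return std::max(lcm_ak1_bk1, k_per_blk<is_single_rate_mfma>);
}

static_assert(choose_k_pack<half_t, 2, 2>() == 4, "narrow vectors -> single rate");
static_assert(choose_k_pack<half_t, 8, 8>() == 8, "wide vectors -> double rate");
static_assert(choose_k_pack<float, 2, 2>() == 8, "other types keep the default path");
```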
@@ -343,10 +343,16 @@ struct GridwiseBatchedGemmMultipleDSoftmaxGemm_Xdl_CShuffle
const auto M = d0_grid_desc_m_n.GetLength(I0);
const auto N = d0_grid_desc_m_n.GetLength(I1);
constexpr auto mfma = MfmaSelector<FloatAB, MPerXdl, NPerXdl>::selected_mfma;
constexpr auto N3 = mfma.num_groups_per_blk;
constexpr auto N4 = mfma.num_input_blks;
constexpr auto N5 = mfma.group_size;
constexpr bool is_single_rate_mfma =
((is_same<FloatAB, half_t>::value || is_same<FloatAB, bhalf_t>::value) &&
math::lcm(AK1, BK1) <= 4)
? true
: false;
constexpr auto mfma =
MfmaSelector<FloatAB, MPerXdl, NPerXdl, FloatAB, is_single_rate_mfma>::selected_mfma;
constexpr auto N3 = mfma.num_groups_per_blk;
constexpr auto N4 = mfma.num_input_blks;
constexpr auto N5 = mfma.group_size;
return transform_tensor_descriptor(
d0_grid_desc_m_n,
make_tuple(make_unmerge_transform(
@@ -552,8 +558,16 @@ struct GridwiseBatchedGemmMultipleDSoftmaxGemm_Xdl_CShuffle
// acc1[m][o] += acc[m][n] * B1[n][o]
// sanity check
constexpr auto lcm_AK1_BK1 = math::lcm(AK1, BK1);
constexpr bool is_single_rate_mfma =
((is_same<FloatAB, half_t>::value || is_same<FloatAB, bhalf_t>::value) &&
lcm_AK1_BK1 <= 4)
? true
: false;
constexpr index_t KPack = math::max(
math::lcm(AK1, BK1), MfmaSelector<FloatAB, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
lcm_AK1_BK1,
MfmaSelector<FloatAB, MPerXdl, NPerXdl, FloatAB, is_single_rate_mfma>::selected_mfma
.k_per_blk);
auto blockwise_gemm = BlockwiseGemmXdlops_v2<
BlockSize,
......
@@ -469,8 +469,16 @@ struct GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle
// acc1[m][o] += acc[m][n] * B1[n][o]
// sanity check
constexpr auto lcm_AK1_BK1 = math::lcm(AK1, BK1);
constexpr bool is_single_rate_mfma =
((is_same<FloatAB, half_t>::value || is_same<FloatAB, bhalf_t>::value) &&
lcm_AK1_BK1 <= 4)
? true
: false;
constexpr index_t KPack = math::max(
math::lcm(AK1, BK1), MfmaSelector<FloatAB, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
lcm_AK1_BK1,
MfmaSelector<FloatAB, MPerXdl, NPerXdl, FloatAB, is_single_rate_mfma>::selected_mfma
.k_per_blk);
auto blockwise_gemm = BlockwiseGemmXdlops_v2<
BlockSize,
......
@@ -498,8 +498,16 @@ struct GridwiseGemmBiasAddReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1
// c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
// register
// sanity check
constexpr auto lcm_AK1_BK1 = math::lcm(AK1, BK1);
constexpr bool is_single_rate_mfma =
((is_same<FloatAB, half_t>::value || is_same<FloatAB, bhalf_t>::value) &&
lcm_AK1_BK1 <= 4)
? true
: false;
constexpr index_t KPack = math::max(
math::lcm(AK1, BK1), MfmaSelector<FloatAB, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
lcm_AK1_BK1,
MfmaSelector<FloatAB, MPerXdl, NPerXdl, FloatAB, is_single_rate_mfma>::selected_mfma
.k_per_blk);
auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector<
BlockSize,
......
@@ -464,8 +464,16 @@ struct GridwiseGemmMultipleDMultipleR_k0mk1_k0nk1_mn_xdl_cshuffle_v1
// c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
// register
// sanity check
constexpr auto lcm_AK1_BK1 = math::lcm(AK1, BK1);
constexpr bool is_single_rate_mfma =
((is_same<FloatAB, half_t>::value || is_same<FloatAB, bhalf_t>::value) &&
lcm_AK1_BK1 <= 4)
? true
: false;
constexpr index_t KPack = math::max(
math::lcm(AK1, BK1), MfmaSelector<FloatAB, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
lcm_AK1_BK1,
MfmaSelector<FloatAB, MPerXdl, NPerXdl, FloatAB, is_single_rate_mfma>::selected_mfma
.k_per_blk);
auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector<
BlockSize,
......
@@ -599,9 +599,16 @@ struct GridwiseGemmMultipleD_xdl_splitk_cshuffle
// c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
// register
// sanity check
constexpr index_t KPack =
math::max(math::lcm(AK1, BK1),
MfmaSelector<AComputeType, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
constexpr auto lcm_AK1_BK1 = math::lcm(AK1, BK1);
constexpr bool is_single_rate_mfma =
((is_same<AComputeType, half_t>::value || is_same<AComputeType, bhalf_t>::value) &&
lcm_AK1_BK1 <= 4)
? true
: false;
constexpr index_t KPack = math::max(
lcm_AK1_BK1,
MfmaSelector<AComputeType, MPerXdl, NPerXdl, AComputeType, is_single_rate_mfma>::
selected_mfma.k_per_blk);
auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector<
BlockSize,
......
@@ -451,8 +451,16 @@ struct GridwiseGemmReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1
// c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
// register
// sanity check
constexpr auto lcm_AK1_BK1 = math::lcm(AK1, BK1);
constexpr bool is_single_rate_mfma =
((is_same<FloatAB, half_t>::value || is_same<FloatAB, bhalf_t>::value) &&
lcm_AK1_BK1 <= 4)
? true
: false;
constexpr index_t KPack = math::max(
math::lcm(AK1, BK1), MfmaSelector<FloatAB, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
lcm_AK1_BK1,
MfmaSelector<FloatAB, MPerXdl, NPerXdl, FloatAB, is_single_rate_mfma>::selected_mfma
.k_per_blk);
auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector<
BlockSize,
......
@@ -581,9 +581,16 @@ struct GridwiseGemmSplitKMultipleD_xdl_cshuffle
// c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
// register
// sanity check
constexpr auto lcm_AK1_BK1 = math::lcm(AK1, BK1);
constexpr bool is_single_rate_mfma =
((is_same<ABDataType, half_t>::value || is_same<ABDataType, bhalf_t>::value) &&
lcm_AK1_BK1 <= 4)
? true
: false;
constexpr index_t KPack =
math::max(math::lcm(AK1, BK1),
MfmaSelector<ABDataType, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
math::max(lcm_AK1_BK1,
MfmaSelector<ABDataType, MPerXdl, NPerXdl, ABDataType, is_single_rate_mfma>::
selected_mfma.k_per_blk);
auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector<
BlockSize,
@@ -1006,9 +1013,16 @@ struct GridwiseGemmSplitKMultipleD_xdl_cshuffle
// c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
// register
// sanity check
constexpr auto lcm_AK1_BK1 = math::lcm(AK1, BK1);
constexpr bool is_single_rate_mfma =
((is_same<ABDataType, half_t>::value || is_same<ABDataType, bhalf_t>::value) &&
lcm_AK1_BK1 <= 4)
? true
: false;
constexpr index_t KPack =
math::max(math::lcm(AK1, BK1),
MfmaSelector<ABDataType, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
math::max(lcm_AK1_BK1,
MfmaSelector<ABDataType, MPerXdl, NPerXdl, ABDataType, is_single_rate_mfma>::
selected_mfma.k_per_blk);
auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector<
BlockSize,
......
@@ -595,9 +595,16 @@ struct GridwiseGemmMultipleD_xdl_splitk_cshuffle
// c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
// register
// sanity check
constexpr index_t KPack =
math::max(math::lcm(AK1, BK1),
MfmaSelector<ComputeType, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
constexpr auto lcm_AK1_BK1 = math::lcm(AK1, BK1);
constexpr bool is_single_rate_mfma =
((is_same<ComputeType, half_t>::value || is_same<ComputeType, bhalf_t>::value) &&
lcm_AK1_BK1 <= 4)
? true
: false;
constexpr index_t KPack = math::max(
lcm_AK1_BK1,
MfmaSelector<ComputeType, MPerXdl, NPerXdl, ComputeType, is_single_rate_mfma>::
selected_mfma.k_per_blk);
auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector<
BlockSize,
......
@@ -79,9 +79,16 @@ struct GridwiseGemm_xdl_cshuffle_v3
static constexpr auto AK1Number = Number<AK1Value>{};
static constexpr auto BK1Number = Number<BK1Value>{};
static constexpr auto lcm_AK1_BK1 = math::lcm(AK1Number, BK1Number);
static constexpr bool is_single_rate_mfma =
((is_same<ComputeTypeA, half_t>::value || is_same<ComputeTypeA, bhalf_t>::value) &&
lcm_AK1_BK1 <= 4)
? true
: false;
static constexpr index_t KPack =
math::max(math::lcm(AK1Number, BK1Number),
MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
math::max(lcm_AK1_BK1,
MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl, ComputeTypeA, is_single_rate_mfma>::
selected_mfma.k_per_blk);
using ThisThreadBlock = ThisThreadBlock<BlockSize>;
......
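In the struct-scope variants, such as GridwiseGemm_xdl_cshuffle_v3 above and GridwiseGemm_xdl_cshuffle_streamk_v3 below, AK1Number and BK1Number are Number<> integral constants rather than plain index_t values, so lcm_AK1_BK1 is itself a compile-time constant that converts implicitly to an integer in the `<= 4` comparison. A small stand-in sketch; this Number alias, the lcm overload, and the values 8/4 are illustrative, not CK's definitions:

```cpp
#include <numeric>
#include <type_traits>

// Minimal stand-in for an integral_constant-like Number<> wrapper.
template <int N>
using Number = std::integral_constant<int, N>;

// lcm over the wrappers stays a wrapper, so it can seed further
// compile-time computation at struct scope.
template <int A, int B>
constexpr auto lcm(Number<A>, Number<B>) { return Number<std::lcm(A, B)>{}; }

static constexpr auto AK1Number   = Number<8>{}; // hypothetical AK1Value = 8
static constexpr auto BK1Number   = Number<4>{}; // hypothetical BK1Value = 4
static constexpr auto lcm_AK1_BK1 = lcm(AK1Number, BK1Number);

static_assert(lcm_AK1_BK1 == 8, "lcm stays a compile-time constant");
static_assert(!(lcm_AK1_BK1 <= 4), "wide enough for a double-rate MFMA");
```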
@@ -139,9 +139,16 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
static constexpr auto AK1Number = Number<AK1Value>{};
static constexpr auto BK1Number = Number<BK1Value>{};
static constexpr auto lcm_AK1_BK1 = math::lcm(AK1Number, BK1Number);
static constexpr bool is_single_rate_mfma =
((is_same<ComputeTypeA, half_t>::value || is_same<ComputeTypeA, bhalf_t>::value) &&
lcm_AK1_BK1 <= 4)
? true
: false;
static constexpr index_t KPack =
math::max(math::lcm(AK1Number, BK1Number),
MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
math::max(lcm_AK1_BK1,
MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl, ComputeTypeA, is_single_rate_mfma>::
selected_mfma.k_per_blk);
using ThisThreadBlock = ThisThreadBlock<BlockSize>;
__host__ static auto CalculateMPadded(index_t M)
......
@@ -869,9 +869,16 @@ struct GridwiseGemm_xdl_cshuffle_v2
// c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
// register
// sanity check
constexpr index_t KPack =
math::max(math::lcm(AK1Number, BK1Number),
MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
constexpr auto lcm_AK1_BK1 = math::lcm(AK1Number, BK1Number);
constexpr bool is_single_rate_mfma =
((is_same<ComputeTypeA, half_t>::value || is_same<ComputeTypeA, bhalf_t>::value) &&
lcm_AK1_BK1 <= 4)
? true
: false;
constexpr index_t KPack = math::max(
lcm_AK1_BK1,
MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl, ComputeTypeA, is_single_rate_mfma>::
selected_mfma.k_per_blk);
// auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector<
// BlockSize,
......
@@ -147,9 +147,16 @@ struct GridwiseGemm_xdl_cshuffle_v3
static constexpr auto AK1Number = Number<AK1Value>{};
static constexpr auto BK1Number = Number<BK1Value>{};
static constexpr auto lcm_AK1_BK1 = math::lcm(AK1Number, BK1Number);
static constexpr bool is_single_rate_mfma =
((is_same<ComputeTypeA, half_t>::value || is_same<ComputeTypeA, bhalf_t>::value) &&
lcm_AK1_BK1 <= 4)
? true
: false;
static constexpr index_t KPack =
math::max(math::lcm(AK1Number, BK1Number),
MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
math::max(lcm_AK1_BK1,
MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl, ComputeTypeA, is_single_rate_mfma>::
selected_mfma.k_per_blk);
using ThisThreadBlock = ThisThreadBlock<BlockSize>;
......
@@ -155,9 +155,16 @@ struct GridwiseGemm_xdl_cshuffle_v3
static constexpr auto AK1Number = Number<AK1Value>{};
static constexpr auto BK1Number = Number<BK1Value>{};
static constexpr auto lcm_AK1_BK1 = math::lcm(AK1Number, BK1Number);
static constexpr bool is_single_rate_mfma =
((is_same<ComputeTypeA, half_t>::value || is_same<ComputeTypeA, bhalf_t>::value) &&
lcm_AK1_BK1 <= 4)
? true
: false;
static constexpr index_t KPack =
math::max(math::lcm(AK1Number, BK1Number),
MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
math::max(lcm_AK1_BK1,
MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl, ComputeTypeA, is_single_rate_mfma>::
selected_mfma.k_per_blk);
using ThisThreadBlock = ThisThreadBlock<BlockSize>;
@@ -1424,7 +1431,8 @@ struct GridwiseGemm_xdl_cshuffle_v3
// b scale
// static_assert(KPerBlock <= ScaleBlockK);
static constexpr auto mfma = MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl>{};
static constexpr auto mfma =
MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl, ComputeTypeA, is_single_rate_mfma>{};
static constexpr auto KPerXdlops = mfma.GetKPerXdlops();
static constexpr auto K1PerXdlops = mfma.GetK1PerXdlops();
static constexpr auto K0PerXdlops = KPerXdlops / K1PerXdlops;
@@ -1895,7 +1903,8 @@ struct GridwiseGemm_xdl_cshuffle_v3
KPerBlock);
// B scale
static constexpr auto mfma = MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl>{};
static constexpr auto mfma =
MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl, ComputeTypeA, is_single_rate_mfma>{};
static constexpr auto KPerXdlops = mfma.GetKPerXdlops();
static constexpr auto K1PerXdlops = mfma.GetK1PerXdlops();
static constexpr auto K0PerXdlops = KPerXdlops / K1PerXdlops;
......
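The two b-scale hunks above route the same flag into the selector whose instruction parameters then fix the K decomposition, KPerXdlops = K0PerXdlops * K1PerXdlops. A hedged arithmetic sketch with assumed fp16 32x32 instruction shapes; the concrete values are not part of this diff:

```cpp
// Assumed instruction parameters (not taken from the diff): the fp16 32x32
// pair is single-rate v_mfma_f32_32x32x8f16 vs double-rate
// v_mfma_f32_32x32x16f16.
constexpr int KPerXdlops_single  = 8; // total K consumed per xdlops
constexpr int K1PerXdlops_single = 4; // contiguous K one lane feeds per issue
static_assert(KPerXdlops_single / K1PerXdlops_single == 2,
              "K0PerXdlops: two outer K steps per xdlops");

constexpr int KPerXdlops_double  = 16;
constexpr int K1PerXdlops_double = 8; // needs >= 8 contiguous K elements per
                                      // lane, hence the lcm(AK1, BK1) <= 4
                                      // fallback to the single-rate form
static_assert(KPerXdlops_double / K1PerXdlops_double == 2, "");
```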
@@ -489,8 +489,16 @@ struct GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
// c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
// register
// sanity check
constexpr auto lcm_AK1_BK1 = math::lcm(AK1, BK1);
constexpr bool is_single_rate_mfma =
((is_same<FloatAB, half_t>::value || is_same<FloatAB, bhalf_t>::value) &&
lcm_AK1_BK1 <= 4)
? true
: false;
constexpr index_t KPack = math::max(
math::lcm(AK1, BK1), MfmaSelector<FloatAB, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
lcm_AK1_BK1,
MfmaSelector<FloatAB, MPerXdl, NPerXdl, FloatAB, is_single_rate_mfma>::selected_mfma
.k_per_blk);
auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector<
BlockSize,
......
@@ -487,9 +487,16 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_waveletmodel_cshuffle
else if(TileMathThreadGroup::IsBelong())
{
// branch early for math wave
constexpr index_t KPack =
math::max(math::lcm(AK1, BK1),
MfmaSelector<ABDataType, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
constexpr auto lcm_AK1_BK1 = math::lcm(AK1, BK1);
constexpr bool is_single_rate_mfma =
((is_same<ABDataType, half_t>::value || is_same<ABDataType, bhalf_t>::value) &&
lcm_AK1_BK1 <= 4)
? true
: false;
constexpr index_t KPack = math::max(
lcm_AK1_BK1,
MfmaSelector<ABDataType, MPerXdl, NPerXdl, ABDataType, is_single_rate_mfma>::
selected_mfma.k_per_blk);
auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<
TileMathThreadGroupSize,
......
@@ -446,8 +446,16 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
// c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
// register
// sanity check
constexpr auto lcm_AK1_BK1 = math::lcm(AK1, BK1);
constexpr bool is_single_rate_mfma =
((is_same<FloatAB, half_t>::value || is_same<FloatAB, bhalf_t>::value) &&
lcm_AK1_BK1 <= 4)
? true
: false;
constexpr index_t k_pack = math::max(
math::lcm(AK1, BK1), MfmaSelector<FloatAB, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
lcm_AK1_BK1,
MfmaSelector<FloatAB, MPerXdl, NPerXdl, FloatAB, is_single_rate_mfma>::selected_mfma
.k_per_blk);
auto blockwise_gemm =
BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
......
@@ -27,12 +27,12 @@
#include "ck_tile/core/numeric/float8.hpp"
#include "ck_tile/core/numeric/half.hpp"
#include "ck_tile/core/numeric/int8.hpp"
#include "ck_tile/core/numeric/pk_int4.hpp"
#include "ck_tile/core/numeric/integer.hpp"
#include "ck_tile/core/numeric/integral_constant.hpp"
#include "ck_tile/core/numeric/math.hpp"
#include "ck_tile/core/numeric/null_type.hpp"
#include "ck_tile/core/numeric/numeric.hpp"
#include "ck_tile/core/numeric/pk_int4.hpp"
#include "ck_tile/core/numeric/type_convert.hpp"
#include "ck_tile/core/numeric/vector_type.hpp"
#include "ck_tile/core/tensor/buffer_view.hpp"
......
@@ -68,52 +68,82 @@ struct transpose_vectors
}
else if constexpr(sizeof(S) == 1)
{
static_assert((NX % 4 == 0 && NY % 4 == 0), "wrong!");
static_assert(((NX % 4 == 0 && NY % 4 == 0) || (NX % 2 == 0 && NY % 2 == 0)), "wrong!");
using S4 = array<S, 4>; // typename array<S, 4>::type;
// loop over 4x4 tile and transpose data from vx_tuple into vy_tuple
static_for<0, NY, 4>{}([&](auto iy) {
static_for<0, NX, 4>{}([&](auto ix) {
// 4 int8x4 data from vx_tuple
const int32_t x_s4_0 =
bit_cast<int32_t>(vx_tuple[ix].template get_as<S4>()[iy / I4]);
const int32_t x_s4_1 =
bit_cast<int32_t>(vx_tuple[ix + I1].template get_as<S4>()[iy / I4]);
const int32_t x_s4_2 =
bit_cast<int32_t>(vx_tuple[ix + I2].template get_as<S4>()[iy / I4]);
const int32_t x_s4_3 =
bit_cast<int32_t>(vx_tuple[ix + I3].template get_as<S4>()[iy / I4]);
// transpose
int32_t t_s4_0, t_s4_1;
int32_t y_s4_0, y_s4_1, y_s4_2, y_s4_3;
constexpr int32_t m0 = 0x05010400;
constexpr int32_t m1 = 0x05040100;
constexpr int32_t m2 = 0x07060302;
constexpr int32_t m3 = 0x07030602;
// ex: v_perm_b32(0x 11 22 33 44, 0x 55 66 77 88, 0x 05 01 04 00) -> 0x33774488
// -- -- -- -- -- -- -- -- - - - -
// index 7 6 5 4 3 2 1 0 33 77 44 88
// index is reversed because of little endianness (least significant bits first)
t_s4_0 = __builtin_amdgcn_perm(x_s4_1, x_s4_0, m0);
t_s4_1 = __builtin_amdgcn_perm(x_s4_3, x_s4_2, m0);
y_s4_0 = __builtin_amdgcn_perm(t_s4_1, t_s4_0, m1);
y_s4_1 = __builtin_amdgcn_perm(t_s4_1, t_s4_0, m2);
t_s4_0 = __builtin_amdgcn_perm(x_s4_1, x_s4_0, m3);
t_s4_1 = __builtin_amdgcn_perm(x_s4_3, x_s4_2, m3);
y_s4_2 = __builtin_amdgcn_perm(t_s4_1, t_s4_0, m1);
y_s4_3 = __builtin_amdgcn_perm(t_s4_1, t_s4_0, m2);
// 4 int8x4 data from vy_tuple
vy_tuple(iy).template get_as<S4>()(ix / I4) = bit_cast<S4>(y_s4_0);
vy_tuple(iy + I1).template get_as<S4>()(ix / I4) = bit_cast<S4>(y_s4_1);
vy_tuple(iy + I2).template get_as<S4>()(ix / I4) = bit_cast<S4>(y_s4_2);
vy_tuple(iy + I3).template get_as<S4>()(ix / I4) = bit_cast<S4>(y_s4_3);
using S2 = array<S, 2>; // typename array<S, 2>::type;
if constexpr(NX % 4 == 0 && NY % 4 == 0)
{
// loop over 4x4 tile and transpose data from vx_tuple into vy_tuple
static_for<0, NY, 4>{}([&](auto iy) {
static_for<0, NX, 4>{}([&](auto ix) {
// 4 int8x4 data from vx_tuple
const int32_t x_s4_0 =
bit_cast<int32_t>(vx_tuple[ix].template get_as<S4>()[iy / I4]);
const int32_t x_s4_1 =
bit_cast<int32_t>(vx_tuple[ix + I1].template get_as<S4>()[iy / I4]);
const int32_t x_s4_2 =
bit_cast<int32_t>(vx_tuple[ix + I2].template get_as<S4>()[iy / I4]);
const int32_t x_s4_3 =
bit_cast<int32_t>(vx_tuple[ix + I3].template get_as<S4>()[iy / I4]);
// transpose
int32_t t_s4_0, t_s4_1;
int32_t y_s4_0, y_s4_1, y_s4_2, y_s4_3;
constexpr int32_t m0 = 0x05010400;
constexpr int32_t m1 = 0x05040100;
constexpr int32_t m2 = 0x07060302;
constexpr int32_t m3 = 0x07030602;
// ex: v_perm_b32(0x 11 22 33 44, 0x 55 66 77 88, 0x 05 01 04 00) ->
// 0x33774488
// -- -- -- -- -- -- -- -- - - - -
// index 7 6 5 4 3 2 1 0 33 77 44 88
// index is reversed because of little endianness (least significant bits
// first)
t_s4_0 = __builtin_amdgcn_perm(x_s4_1, x_s4_0, m0);
t_s4_1 = __builtin_amdgcn_perm(x_s4_3, x_s4_2, m0);
y_s4_0 = __builtin_amdgcn_perm(t_s4_1, t_s4_0, m1);
y_s4_1 = __builtin_amdgcn_perm(t_s4_1, t_s4_0, m2);
t_s4_0 = __builtin_amdgcn_perm(x_s4_1, x_s4_0, m3);
t_s4_1 = __builtin_amdgcn_perm(x_s4_3, x_s4_2, m3);
y_s4_2 = __builtin_amdgcn_perm(t_s4_1, t_s4_0, m1);
y_s4_3 = __builtin_amdgcn_perm(t_s4_1, t_s4_0, m2);
// 4 int8x4 data from vy_tuple
vy_tuple(iy).template get_as<S4>()(ix / I4) = bit_cast<S4>(y_s4_0);
vy_tuple(iy + I1).template get_as<S4>()(ix / I4) = bit_cast<S4>(y_s4_1);
vy_tuple(iy + I2).template get_as<S4>()(ix / I4) = bit_cast<S4>(y_s4_2);
vy_tuple(iy + I3).template get_as<S4>()(ix / I4) = bit_cast<S4>(y_s4_3);
});
});
});
}
else if constexpr(NX % 2 == 0 && NY % 2 == 0)
{
static_for<0, NY, 2>{}([&](auto ix) {
static_for<0, NX, 2>{}([&](auto iy) {
const int16_t x_s2_0 =
bit_cast<int16_t>(vx_tuple[ix].template get_as<S2>()[iy / I2]);
const int16_t x_s2_1 =
bit_cast<int16_t>(vx_tuple[ix + I1].template get_as<S2>()[iy / I2]);
constexpr int32_t m0 = 0x05040100;
constexpr int32_t m1 = 0x07060302;
const int32_t x0_32 = static_cast<int32_t>(x_s2_0 & 0xFFFF);
const int32_t x1_32 = static_cast<int32_t>(x_s2_1 & 0xFFFF);
const int32_t y_s2_0 = __builtin_amdgcn_perm(x1_32, x0_32, m0);
const int32_t y_s2_1 = __builtin_amdgcn_perm(x1_32, x0_32, m1);
vy_tuple(iy).template get_as<S2>()[ix / I2] =
bit_cast<S2>(static_cast<int16_t>(y_s2_0 & 0xFFFF));
vy_tuple(iy + I1).template get_as<S2>()[ix / I2] =
bit_cast<S2>(static_cast<int16_t>(y_s2_1 & 0xFFFF));
});
});
}
}
else
{
......
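Both byte-shuffle branches are built on v_perm_b32, whose semantics the in-code comment documents by example. A host-side model of __builtin_amdgcn_perm can be checked against that example; selector values above 7, which yield hardware constants, are not modeled here, and the 2x2 selectors shown are illustrative, not necessarily the kernel's masks:

```cpp
#include <cstdint>

// Host-side model of v_perm_b32 (__builtin_amdgcn_perm(a, b, sel)): selector
// byte i picks byte i of the result from the 8-byte pool in which bytes 0-3
// come from b and bytes 4-7 from a.
constexpr uint32_t perm_b32(uint32_t a, uint32_t b, uint32_t sel)
{
    const uint64_t pool = (static_cast<uint64_t>(a) << 32) | b;
    uint32_t r = 0;
    for(int i = 0; i < 4; ++i)
    {
        const uint32_t s = (sel >> (8 * i)) & 0xFF; // must be 0-7 in this model
        r |= static_cast<uint32_t>((pool >> (8 * s)) & 0xFF) << (8 * i);
    }
    return r;
}

// Reproduces the worked example from the comment in the diff.
static_assert(perm_b32(0x11223344u, 0x55667788u, 0x05010400u) == 0x33774488u, "");

// A 2x2 byte transpose needs the first byte of each input row packed into one
// 16-bit pair and the second byte of each row into the other; under this
// model, selectors 0x0400 and 0x0501 do that.
static_assert((perm_b32(0x0000BBAAu, 0x0000DDCCu, 0x00000400u) & 0xFFFFu) == 0xAACCu, "");
static_assert((perm_b32(0x0000BBAAu, 0x0000DDCCu, 0x00000501u) & 0xFFFFu) == 0xBBDDu, "");
```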
@@ -5,6 +5,7 @@
#include "ck_tile/host/arg_parser.hpp"
#include "ck_tile/host/check_err.hpp"
#include "ck_tile/host/concat.hpp"
#include "ck_tile/host/convolution_host_tensor_descriptor_helper.hpp"
#include "ck_tile/host/convolution_parameter.hpp"
#include "ck_tile/host/device_memory.hpp"
......