Unverified commit b75216fa, authored Feb 17, 2025 by kylasa, committed by GitHub on Feb 17, 2025

Merge branch 'develop' into kylasa_1870

Parents: 610f9a34, 3b230208
Changes: 118 files in this merge; this page shows 20 changed files with 367 additions and 116 deletions (+367 -116).
include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_splitk_cshuffle.hpp (+10 -3)
include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp (+9 -1)
include/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle.hpp (+18 -4)
include/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle_v2.hpp (+10 -3)
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_bwd_weight_v3.hpp (+9 -2)
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp (+9 -2)
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v2.hpp (+10 -3)
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp (+9 -2)
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp (+13 -4)
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp (+9 -1)
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_waveletmodel_cshuffle.hpp (+10 -3)
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp (+9 -1)
include/ck_tile/core/utility/transpose_vectors.hpp (+73 -43)
include/ck_tile/ops/fmha/block/block_masking.hpp (+1 -1)
include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp (+1 -1)
include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs.hpp (+2 -0)
include/ck_tile/ops/gemm.hpp (+2 -0)
include/ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v1.hpp (+59 -29)
include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp (+93 -8)
include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp (+11 -5)
include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_splitk_cshuffle.hpp

@@ -599,9 +599,16 @@ struct GridwiseGemmMultipleD_xdl_splitk_cshuffle
     // c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
     // register
     // sanity check
-    constexpr index_t KPack = math::max(
-        math::lcm(AK1, BK1),
-        MfmaSelector<AComputeType, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
+    constexpr auto lcm_AK1_BK1 = math::lcm(AK1, BK1);
+    constexpr bool is_single_rate_mfma =
+        ((is_same<AComputeType, half_t>::value || is_same<AComputeType, bhalf_t>::value) &&
+         lcm_AK1_BK1 <= 4)
+            ? true
+            : false;
+    constexpr index_t KPack =
+        math::max(lcm_AK1_BK1,
+                  MfmaSelector<AComputeType, MPerXdl, NPerXdl, AComputeType, is_single_rate_mfma>::
+                      selected_mfma.k_per_blk);

     auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector<BlockSize,
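This hunk, repeated below with the matching compute type in each gridwise GEMM header, gates MfmaSelector on a new is_single_rate_mfma flag: fp16/bf16 problems whose lcm(AK1, BK1) is at most 4 fall back to a single-rate MFMA so that KPack is not inflated by the double-rate instruction's larger k_per_blk. A host-side sketch of the selection logic, with stand-in types and an assumed k_per_blk table (MfmaSelector's real values depend on the target ISA):

```cpp
#include <algorithm>
#include <numeric>
#include <type_traits>

// Stand-ins for ck::half_t / ck::bhalf_t, used only for this sketch.
struct half_t {};
struct bhalf_t {};

constexpr int k_per_blk(bool is_single_rate_mfma)
{
    // assumed values: a double-rate fp16 MFMA consumes 8 k-elements per block,
    // the single-rate variant consumes 4
    return is_single_rate_mfma ? 4 : 8;
}

template <typename ComputeType>
constexpr int select_kpack(int AK1, int BK1)
{
    const int lcm_AK1_BK1 = std::lcm(AK1, BK1);
    const bool is_single_rate_mfma =
        (std::is_same_v<ComputeType, half_t> || std::is_same_v<ComputeType, bhalf_t>) &&
        lcm_AK1_BK1 <= 4;
    return std::max(lcm_AK1_BK1, k_per_blk(is_single_rate_mfma));
}

static_assert(select_kpack<half_t>(2, 4) == 4); // single-rate path keeps KPack at 4, not 8
static_assert(select_kpack<float>(2, 4) == 8);  // other types keep the double-rate k_per_blk
```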
include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp

@@ -451,8 +451,16 @@ struct GridwiseGemmReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1
     // c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
     // register
     // sanity check
+    constexpr auto lcm_AK1_BK1 = math::lcm(AK1, BK1);
+    constexpr bool is_single_rate_mfma =
+        ((is_same<FloatAB, half_t>::value || is_same<FloatAB, bhalf_t>::value) &&
+         lcm_AK1_BK1 <= 4)
+            ? true
+            : false;
     constexpr index_t KPack = math::max(
-        math::lcm(AK1, BK1),
-        MfmaSelector<FloatAB, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
+        lcm_AK1_BK1,
+        MfmaSelector<FloatAB, MPerXdl, NPerXdl, FloatAB, is_single_rate_mfma>::selected_mfma
+            .k_per_blk);

     auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector<BlockSize,
include/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle.hpp

@@ -581,9 +581,16 @@ struct GridwiseGemmSplitKMultipleD_xdl_cshuffle
     // c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
     // register
     // sanity check
+    constexpr auto lcm_AK1_BK1 = math::lcm(AK1, BK1);
+    constexpr bool is_single_rate_mfma =
+        ((is_same<ABDataType, half_t>::value || is_same<ABDataType, bhalf_t>::value) &&
+         lcm_AK1_BK1 <= 4)
+            ? true
+            : false;
     constexpr index_t KPack =
-        math::max(math::lcm(AK1, BK1),
-                  MfmaSelector<ABDataType, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
+        math::max(lcm_AK1_BK1,
+                  MfmaSelector<ABDataType, MPerXdl, NPerXdl, ABDataType, is_single_rate_mfma>::
+                      selected_mfma.k_per_blk);

     auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector<BlockSize,

@@ -1006,9 +1013,16 @@ struct GridwiseGemmSplitKMultipleD_xdl_cshuffle
     // c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
     // register
     // sanity check
+    constexpr auto lcm_AK1_BK1 = math::lcm(AK1, BK1);
+    constexpr bool is_single_rate_mfma =
+        ((is_same<ABDataType, half_t>::value || is_same<ABDataType, bhalf_t>::value) &&
+         lcm_AK1_BK1 <= 4)
+            ? true
+            : false;
     constexpr index_t KPack =
-        math::max(math::lcm(AK1, BK1),
-                  MfmaSelector<ABDataType, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
+        math::max(lcm_AK1_BK1,
+                  MfmaSelector<ABDataType, MPerXdl, NPerXdl, ABDataType, is_single_rate_mfma>::
+                      selected_mfma.k_per_blk);

     auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector<BlockSize,
include/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle_v2.hpp

@@ -595,9 +595,16 @@ struct GridwiseGemmMultipleD_xdl_splitk_cshuffle
     // c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
     // register
     // sanity check
-    constexpr index_t KPack = math::max(
-        math::lcm(AK1, BK1),
-        MfmaSelector<ComputeType, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
+    constexpr auto lcm_AK1_BK1 = math::lcm(AK1, BK1);
+    constexpr bool is_single_rate_mfma =
+        ((is_same<ComputeType, half_t>::value || is_same<ComputeType, bhalf_t>::value) &&
+         lcm_AK1_BK1 <= 4)
+            ? true
+            : false;
+    constexpr index_t KPack =
+        math::max(lcm_AK1_BK1,
+                  MfmaSelector<ComputeType, MPerXdl, NPerXdl, ComputeType, is_single_rate_mfma>::
+                      selected_mfma.k_per_blk);

     auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector<BlockSize,
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_bwd_weight_v3.hpp

@@ -79,9 +79,16 @@ struct GridwiseGemm_xdl_cshuffle_v3
     static constexpr auto AK1Number = Number<AK1Value>{};
     static constexpr auto BK1Number = Number<BK1Value>{};

+    static constexpr auto lcm_AK1_BK1 = math::lcm(AK1Number, BK1Number);
+    static constexpr bool is_single_rate_mfma =
+        ((is_same<ComputeTypeA, half_t>::value || is_same<ComputeTypeA, bhalf_t>::value) &&
+         lcm_AK1_BK1 <= 4)
+            ? true
+            : false;
     static constexpr index_t KPack =
-        math::max(math::lcm(AK1Number, BK1Number),
-                  MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
+        math::max(lcm_AK1_BK1,
+                  MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl, ComputeTypeA, is_single_rate_mfma>::
+                      selected_mfma.k_per_blk);

     using ThisThreadBlock = ThisThreadBlock<BlockSize>;
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp
old mode 100755 → new mode 100644

@@ -139,9 +139,16 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
     static constexpr auto AK1Number = Number<AK1Value>{};
     static constexpr auto BK1Number = Number<BK1Value>{};

+    static constexpr auto lcm_AK1_BK1 = math::lcm(AK1Number, BK1Number);
+    static constexpr bool is_single_rate_mfma =
+        ((is_same<ComputeTypeA, half_t>::value || is_same<ComputeTypeA, bhalf_t>::value) &&
+         lcm_AK1_BK1 <= 4)
+            ? true
+            : false;
     static constexpr index_t KPack =
-        math::max(math::lcm(AK1Number, BK1Number),
-                  MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
+        math::max(lcm_AK1_BK1,
+                  MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl, ComputeTypeA, is_single_rate_mfma>::
+                      selected_mfma.k_per_blk);

     using ThisThreadBlock = ThisThreadBlock<BlockSize>;

     __host__ static auto CalculateMPadded(index_t M)
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v2.hpp

@@ -869,9 +869,16 @@ struct GridwiseGemm_xdl_cshuffle_v2
     // c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
     // register
     // sanity check
-    constexpr index_t KPack = math::max(
-        math::lcm(AK1Number, BK1Number),
-        MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
+    constexpr auto lcm_AK1_BK1 = math::lcm(AK1Number, BK1Number);
+    constexpr bool is_single_rate_mfma =
+        ((is_same<ComputeTypeA, half_t>::value || is_same<ComputeTypeA, bhalf_t>::value) &&
+         lcm_AK1_BK1 <= 4)
+            ? true
+            : false;
+    constexpr index_t KPack =
+        math::max(lcm_AK1_BK1,
+                  MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl, ComputeTypeA, is_single_rate_mfma>::
+                      selected_mfma.k_per_blk);

     // auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector<
     //     BlockSize,
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp

@@ -147,9 +147,16 @@ struct GridwiseGemm_xdl_cshuffle_v3
     static constexpr auto AK1Number = Number<AK1Value>{};
     static constexpr auto BK1Number = Number<BK1Value>{};

+    static constexpr auto lcm_AK1_BK1 = math::lcm(AK1Number, BK1Number);
+    static constexpr bool is_single_rate_mfma =
+        ((is_same<ComputeTypeA, half_t>::value || is_same<ComputeTypeA, bhalf_t>::value) &&
+         lcm_AK1_BK1 <= 4)
+            ? true
+            : false;
     static constexpr index_t KPack =
-        math::max(math::lcm(AK1Number, BK1Number),
-                  MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
+        math::max(lcm_AK1_BK1,
+                  MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl, ComputeTypeA, is_single_rate_mfma>::
+                      selected_mfma.k_per_blk);

     using ThisThreadBlock = ThisThreadBlock<BlockSize>;
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp

@@ -155,9 +155,16 @@ struct GridwiseGemm_xdl_cshuffle_v3
     static constexpr auto AK1Number = Number<AK1Value>{};
     static constexpr auto BK1Number = Number<BK1Value>{};

+    static constexpr auto lcm_AK1_BK1 = math::lcm(AK1Number, BK1Number);
+    static constexpr bool is_single_rate_mfma =
+        ((is_same<ComputeTypeA, half_t>::value || is_same<ComputeTypeA, bhalf_t>::value) &&
+         lcm_AK1_BK1 <= 4)
+            ? true
+            : false;
     static constexpr index_t KPack =
-        math::max(math::lcm(AK1Number, BK1Number),
-                  MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
+        math::max(lcm_AK1_BK1,
+                  MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl, ComputeTypeA, is_single_rate_mfma>::
+                      selected_mfma.k_per_blk);

     using ThisThreadBlock = ThisThreadBlock<BlockSize>;

@@ -1424,7 +1431,8 @@ struct GridwiseGemm_xdl_cshuffle_v3
         // b scale
         // static_assert(KPerBlock <= ScaleBlockK);
-        static constexpr auto mfma = MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl>{};
+        static constexpr auto mfma =
+            MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl, ComputeTypeA, is_single_rate_mfma>{};
         static constexpr auto KPerXdlops  = mfma.GetKPerXdlops();
         static constexpr auto K1PerXdlops = mfma.GetK1PerXdlops();
         static constexpr auto K0PerXdlops = KPerXdlops / K1PerXdlops;

@@ -1895,7 +1903,8 @@ struct GridwiseGemm_xdl_cshuffle_v3
             KPerBlock);
         // B scale
-        static constexpr auto mfma = MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl>{};
+        static constexpr auto mfma =
+            MfmaSelector<ComputeTypeA, MPerXdl, NPerXdl, ComputeTypeA, is_single_rate_mfma>{};
         static constexpr auto KPerXdlops  = mfma.GetKPerXdlops();
         static constexpr auto K1PerXdlops = mfma.GetK1PerXdlops();
         static constexpr auto K0PerXdlops = KPerXdlops / K1PerXdlops;
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp

@@ -489,8 +489,16 @@ struct GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
     // c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
     // register
     // sanity check
+    constexpr auto lcm_AK1_BK1 = math::lcm(AK1, BK1);
+    constexpr bool is_single_rate_mfma =
+        ((is_same<FloatAB, half_t>::value || is_same<FloatAB, bhalf_t>::value) &&
+         lcm_AK1_BK1 <= 4)
+            ? true
+            : false;
     constexpr index_t KPack = math::max(
-        math::lcm(AK1, BK1),
-        MfmaSelector<FloatAB, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
+        lcm_AK1_BK1,
+        MfmaSelector<FloatAB, MPerXdl, NPerXdl, FloatAB, is_single_rate_mfma>::selected_mfma
+            .k_per_blk);

     auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector<BlockSize,
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_waveletmodel_cshuffle.hpp

@@ -487,9 +487,16 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_waveletmodel_cshuffle
     else if(TileMathThreadGroup::IsBelong())
     {
         // branch early for math wave
-        constexpr index_t KPack = math::max(
-            math::lcm(AK1, BK1),
-            MfmaSelector<ABDataType, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
+        constexpr auto lcm_AK1_BK1 = math::lcm(AK1, BK1);
+        constexpr bool is_single_rate_mfma =
+            ((is_same<ABDataType, half_t>::value || is_same<ABDataType, bhalf_t>::value) &&
+             lcm_AK1_BK1 <= 4)
+                ? true
+                : false;
+        constexpr index_t KPack =
+            math::max(lcm_AK1_BK1,
+                      MfmaSelector<ABDataType, MPerXdl, NPerXdl, ABDataType, is_single_rate_mfma>::
+                          selected_mfma.k_per_blk);

         auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<TileMathThreadGroupSize,
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp

@@ -446,8 +446,16 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
     // c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
     // register
     // sanity check
+    constexpr auto lcm_AK1_BK1 = math::lcm(AK1, BK1);
+    constexpr bool is_single_rate_mfma =
+        ((is_same<FloatAB, half_t>::value || is_same<FloatAB, bhalf_t>::value) &&
+         lcm_AK1_BK1 <= 4)
+            ? true
+            : false;
     constexpr index_t k_pack = math::max(
-        math::lcm(AK1, BK1),
-        MfmaSelector<FloatAB, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
+        lcm_AK1_BK1,
+        MfmaSelector<FloatAB, MPerXdl, NPerXdl, FloatAB, is_single_rate_mfma>::selected_mfma
+            .k_per_blk);

     auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
include/ck_tile/core/utility/transpose_vectors.hpp

@@ -68,52 +68,82 @@ struct transpose_vectors
        }
        else if constexpr(sizeof(S) == 1)
        {
-            static_assert((NX % 4 == 0 && NY % 4 == 0), "wrong!");
+            static_assert(((NX % 4 == 0 && NY % 4 == 0) || (NX % 2 == 0 && NY % 2 == 0)),
+                          "wrong!");

            using S4 = array<S, 4>; // typename array<S, 4>::type;
+            using S2 = array<S, 2>; // typename array<S, 4>::type;

-            // loop over 4x4 tile and transpose data from vx_tuple into vy_tuple
-            static_for<0, NY, 4>{}([&](auto iy) {
-                static_for<0, NX, 4>{}([&](auto ix) {
-                    // 4 int8x4 data from vx_tuple
-                    const int32_t x_s4_0 =
-                        bit_cast<int32_t>(vx_tuple[ix].template get_as<S4>()[iy / I4]);
-                    const int32_t x_s4_1 =
-                        bit_cast<int32_t>(vx_tuple[ix + I1].template get_as<S4>()[iy / I4]);
-                    const int32_t x_s4_2 =
-                        bit_cast<int32_t>(vx_tuple[ix + I2].template get_as<S4>()[iy / I4]);
-                    const int32_t x_s4_3 =
-                        bit_cast<int32_t>(vx_tuple[ix + I3].template get_as<S4>()[iy / I4]);
-
-                    // transpose
-                    int32_t t_s4_0, t_s4_1;
-                    int32_t y_s4_0, y_s4_1, y_s4_2, y_s4_3;
-
-                    constexpr int32_t m0 = 0x05010400;
-                    constexpr int32_t m1 = 0x05040100;
-                    constexpr int32_t m2 = 0x07060302;
-                    constexpr int32_t m3 = 0x07030602;
-
-                    // ex: v_perm_b32(0x 11 22 33 44, 0x 55 66 77 88, 0x 05 01 04 00) -> 0x33774488
-                    //       -- -- -- --  -- -- -- --       -  -  -  -
-                    // index  7  6  5  4   3  2  1  0      33 77 44 88
-                    // index is reversed because of little endianness (least significant bits first)
-                    t_s4_0 = __builtin_amdgcn_perm(x_s4_1, x_s4_0, m0);
-                    t_s4_1 = __builtin_amdgcn_perm(x_s4_3, x_s4_2, m0);
-                    y_s4_0 = __builtin_amdgcn_perm(t_s4_1, t_s4_0, m1);
-                    y_s4_1 = __builtin_amdgcn_perm(t_s4_1, t_s4_0, m2);
-                    t_s4_0 = __builtin_amdgcn_perm(x_s4_1, x_s4_0, m3);
-                    t_s4_1 = __builtin_amdgcn_perm(x_s4_3, x_s4_2, m3);
-                    y_s4_2 = __builtin_amdgcn_perm(t_s4_1, t_s4_0, m1);
-                    y_s4_3 = __builtin_amdgcn_perm(t_s4_1, t_s4_0, m2);
-
-                    // 4 int8x4 data from vy_tuple
-                    vy_tuple(iy).template get_as<S4>()(ix / I4)      = bit_cast<S4>(y_s4_0);
-                    vy_tuple(iy + I1).template get_as<S4>()(ix / I4) = bit_cast<S4>(y_s4_1);
-                    vy_tuple(iy + I2).template get_as<S4>()(ix / I4) = bit_cast<S4>(y_s4_2);
-                    vy_tuple(iy + I3).template get_as<S4>()(ix / I4) = bit_cast<S4>(y_s4_3);
-                });
-            });
+            if constexpr(NX % 4 == 0 && NY % 4 == 0)
+            {
+                // loop over 4x4 tile and transpose data from vx_tuple into vy_tuple
+                static_for<0, NY, 4>{}([&](auto iy) {
+                    static_for<0, NX, 4>{}([&](auto ix) {
+                        // 4 int8x4 data from vx_tuple
+                        const int32_t x_s4_0 =
+                            bit_cast<int32_t>(vx_tuple[ix].template get_as<S4>()[iy / I4]);
+                        const int32_t x_s4_1 =
+                            bit_cast<int32_t>(vx_tuple[ix + I1].template get_as<S4>()[iy / I4]);
+                        const int32_t x_s4_2 =
+                            bit_cast<int32_t>(vx_tuple[ix + I2].template get_as<S4>()[iy / I4]);
+                        const int32_t x_s4_3 =
+                            bit_cast<int32_t>(vx_tuple[ix + I3].template get_as<S4>()[iy / I4]);
+
+                        // transpose
+                        int32_t t_s4_0, t_s4_1;
+                        int32_t y_s4_0, y_s4_1, y_s4_2, y_s4_3;
+
+                        constexpr int32_t m0 = 0x05010400;
+                        constexpr int32_t m1 = 0x05040100;
+                        constexpr int32_t m2 = 0x07060302;
+                        constexpr int32_t m3 = 0x07030602;
+
+                        // ex: v_perm_b32(0x 11 22 33 44, 0x 55 66 77 88, 0x 05 01 04 00) ->
+                        // 0x33774488
+                        //       -- -- -- --  -- -- -- --       -  -  -  -
+                        // index  7  6  5  4   3  2  1  0      33 77 44 88
+                        // index is reversed because of little endianness (least significant bits
+                        // first)
+                        t_s4_0 = __builtin_amdgcn_perm(x_s4_1, x_s4_0, m0);
+                        t_s4_1 = __builtin_amdgcn_perm(x_s4_3, x_s4_2, m0);
+                        y_s4_0 = __builtin_amdgcn_perm(t_s4_1, t_s4_0, m1);
+                        y_s4_1 = __builtin_amdgcn_perm(t_s4_1, t_s4_0, m2);
+                        t_s4_0 = __builtin_amdgcn_perm(x_s4_1, x_s4_0, m3);
+                        t_s4_1 = __builtin_amdgcn_perm(x_s4_3, x_s4_2, m3);
+                        y_s4_2 = __builtin_amdgcn_perm(t_s4_1, t_s4_0, m1);
+                        y_s4_3 = __builtin_amdgcn_perm(t_s4_1, t_s4_0, m2);
+
+                        // 4 int8x4 data from vy_tuple
+                        vy_tuple(iy).template get_as<S4>()(ix / I4)      = bit_cast<S4>(y_s4_0);
+                        vy_tuple(iy + I1).template get_as<S4>()(ix / I4) = bit_cast<S4>(y_s4_1);
+                        vy_tuple(iy + I2).template get_as<S4>()(ix / I4) = bit_cast<S4>(y_s4_2);
+                        vy_tuple(iy + I3).template get_as<S4>()(ix / I4) = bit_cast<S4>(y_s4_3);
+                    });
+                });
+            }
+            else if constexpr(NX % 2 == 0 && NY % 2 == 0)
+            {
+                static_for<0, NY, 2>{}([&](auto ix) {
+                    static_for<0, NX, 2>{}([&](auto iy) {
+                        const int16_t x_s2_0 =
+                            bit_cast<int16_t>(vx_tuple[ix].template get_as<S2>()[iy / I2]);
+                        const int16_t x_s2_1 =
+                            bit_cast<int16_t>(vx_tuple[ix + I1].template get_as<S2>()[iy / I2]);
+
+                        constexpr int32_t m0 = 0x05040100;
+                        constexpr int32_t m1 = 0x07060302;
+
+                        const int32_t x0_32 = static_cast<int32_t>(x_s2_0 & 0xFFFF);
+                        const int32_t x1_32 = static_cast<int32_t>(x_s2_1 & 0xFFFF);
+
+                        const int32_t y_s2_0 = __builtin_amdgcn_perm(x1_32, x0_32, m0);
+                        const int32_t y_s2_1 = __builtin_amdgcn_perm(x1_32, x0_32, m1);
+
+                        vy_tuple(iy).template get_as<S2>()[ix / I2] =
+                            bit_cast<S2>(static_cast<int16_t>(y_s2_0 & 0xFFFF));
+                        vy_tuple(iy + I1).template get_as<S2>()[ix / I2] =
+                            bit_cast<S2>(static_cast<int16_t>(y_s2_1 & 0xFFFF));
+                    });
+                });
+            }
        }
        else
        {
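Since this hunk only re-wraps the 4-byte path and adds a 2-element path, the selector constants are unchanged, and they can be sanity-checked off-device. Below is a host-side model of v_perm_b32 (a sketch assuming the instruction's byte-pool semantics: src1 supplies pool bytes 0..3, src0 supplies bytes 4..7; the sign-replicating selectors 8..15 are not modeled):

```cpp
#include <cassert>
#include <cstdint>
#include <cstdio>

// Host-side model of v_perm_b32 (__builtin_amdgcn_perm): the two sources form
// an 8-byte pool and each selector byte picks one pool byte into the result.
static uint32_t perm_b32(uint32_t src0, uint32_t src1, uint32_t sel)
{
    const uint64_t pool = (static_cast<uint64_t>(src0) << 32) | src1;
    uint32_t out        = 0;
    for(int i = 0; i < 4; ++i)
    {
        const uint32_t s = (sel >> (8 * i)) & 0xFF;
        out |= static_cast<uint32_t>((pool >> (8 * s)) & 0xFF) << (8 * i);
    }
    return out;
}

int main()
{
    // rows of a 4x4 byte tile, least significant byte first:
    // x0 = [00 01 02 03], x1 = [10 11 12 13], x2 = [20 ...], x3 = [30 ...]
    const uint32_t x0 = 0x03020100, x1 = 0x13121110, x2 = 0x23222120, x3 = 0x33323130;
    const uint32_t m0 = 0x05010400, m1 = 0x05040100, m2 = 0x07060302, m3 = 0x07030602;

    uint32_t t0 = perm_b32(x1, x0, m0); // interleave low byte pairs of x0/x1
    uint32_t t1 = perm_b32(x3, x2, m0); // interleave low byte pairs of x2/x3
    const uint32_t y0 = perm_b32(t1, t0, m1);
    const uint32_t y1 = perm_b32(t1, t0, m2);
    t0 = perm_b32(x1, x0, m3);          // interleave high byte pairs of x0/x1
    t1 = perm_b32(x3, x2, m3);          // interleave high byte pairs of x2/x3
    const uint32_t y2 = perm_b32(t1, t0, m1);
    const uint32_t y3 = perm_b32(t1, t0, m2);

    // the columns of the input tile come out as the rows of the result
    assert(y0 == 0x30201000 && y1 == 0x31211101 && y2 == 0x32221202 && y3 == 0x33231303);
    std::printf("4x4 byte transpose verified\n");
    return 0;
}
```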
include/ck_tile/ops/fmha/block/block_masking.hpp

@@ -310,7 +310,7 @@ struct SimplifiedGenericAttentionMask
         const index_t x_per_split = ck_tile::max(1, integer_divide_ceil(x_total, num_splits));
         const index_t split_start = x_per_split * i_split;
-        const index_t split_end   = split_start + x_per_split;
+        const index_t split_end   = ck_tile::min(x_total, split_start + x_per_split);

         return ck_tile::make_tuple(ck_tile::max(origin_start, split_start),
                                    ck_tile::min(origin_end, split_end));
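The one-line fix clamps the last split so it cannot extend past x_total. A standalone sketch with hypothetical helper names (not the CK API):

```cpp
#include <algorithm>
#include <cstdio>

// When x_total is not divisible by num_splits, the old code let the last
// split's end run past x_total; clamping with min() fixes that.
struct SplitRange { int start, end; };

SplitRange split_range(int x_total, int num_splits, int i_split)
{
    const int x_per_split = std::max(1, (x_total + num_splits - 1) / num_splits);
    const int split_start = x_per_split * i_split;
    const int split_end   = std::min(x_total, split_start + x_per_split);
    return {split_start, split_end};
}

int main()
{
    // x_total = 10, num_splits = 4 -> x_per_split = 3
    // splits: [0,3) [3,6) [6,9) [9,10); the unclamped code produced [9,12) last.
    for(int i = 0; i < 4; ++i)
    {
        const SplitRange r = split_range(10, 4, i);
        std::printf("split %d: [%d, %d)\n", i, r.start, r.end);
    }
    return 0;
}
```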
include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp

@@ -742,7 +742,7 @@ struct FmhaFwdSplitKVKernel
             return pad_tensor_view(
                 v_dram_transposed,
                 make_tuple(number<FmhaPipeline::kN1>{}, number<FmhaPipeline::kK1>{}),
-                sequence<kPadHeadDimV, false>{});
+                sequence<kPadHeadDimV, kPadSeqLenK>{});
         }
         else
         {
include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs.hpp

@@ -343,6 +343,8 @@ struct BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVS
             // moving k_dram_window is an in-page-block operation, so there is
             // no need to invoke k_page_block_navigator.move_tile_window() here.
             move_tile_window(k_dram_window, {0, kK0});
+            // ensure LDS access by Q is done before the over-writting by K
+            block_sync_lds();
             store_tile(k_lds_window, tile_elementwise_in(k_element_func, k_block_tile));

         do
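The added block_sync_lds() is a classic read-before-overwrite barrier. A minimal generic kernel sketch (hypothetical, not the CK pipeline) of why the barrier must sit between the last read of the old LDS contents and the first store of the new tile:

```cpp
#include <hip/hip_runtime.h>

// The splitkv pipeline reuses one LDS tile first for Q-related reads and then
// for the incoming K tile, so every lane must finish reading the old contents
// before any lane overwrites them. __syncthreads() here plays the role of
// block_sync_lds() in the real code.
__global__ void reuse_lds_tile(const float* q, const float* k, float* out)
{
    __shared__ float tile[256];

    tile[threadIdx.x] = q[threadIdx.x];
    __syncthreads();

    // read the current tile, possibly a neighbour lane's element
    const float acc = tile[(threadIdx.x + 1) % 256];

    __syncthreads(); // the added barrier: all reads of the old tile are done

    tile[threadIdx.x] = k[threadIdx.x]; // safe overwrite with the K tile
    __syncthreads();

    out[threadIdx.x] = acc + tile[threadIdx.x];
}
```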
include/ck_tile/ops/gemm.hpp

@@ -29,6 +29,8 @@
 #include "ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp"
+#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp"
+#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4_default_policy.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp"
include/ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v1.hpp

@@ -14,24 +14,54 @@ namespace ck_tile {
 template <typename Problem_, typename Policy_ = BlockGemmARegBRegCRegV1DefaultPolicy>
 struct BlockGemmARegBRegCRegV1
 {
-    using Problem        = remove_cvref_t<Problem_>;
-    using Policy         = remove_cvref_t<Policy_>;
-    using ADataType      = remove_cvref_t<typename Problem::ADataType>;
-    using BDataType      = remove_cvref_t<typename Problem::BDataType>;
-    using CDataType      = remove_cvref_t<typename Problem::CDataType>;
-    using BlockGemmShape = remove_cvref_t<typename Problem::BlockGemmShape>;
-
-    static constexpr index_t kBlockSize = Problem::kBlockSize;
-
-    static constexpr index_t MPerBlock = BlockGemmShape::kM;
-    static constexpr index_t NPerBlock = BlockGemmShape::kN;
-    static constexpr index_t KPerBlock = BlockGemmShape::kK;
-
-    static constexpr auto config = Policy::template GetWarpGemmMWarpNWarp<Problem>();
-    using WG                     = remove_cvref_t<decltype(config.template at<0>())>;
-
-    static constexpr index_t MWarp = config.template at<1>();
-    static constexpr index_t NWarp = config.template at<2>();
-
-    static constexpr index_t MIterPerWarp = MPerBlock / (MWarp * WG::kM);
-    static constexpr index_t NIterPerWarp = NPerBlock / (NWarp * WG::kN);
-    static constexpr index_t KIterPerWarp = KPerBlock / WG::kK;
+    private:
+    template <typename PipelineProblem_, typename GemmPolicy_>
+    struct GemmTraits_
+    {
+        using Problem        = remove_cvref_t<PipelineProblem_>;
+        using Policy         = remove_cvref_t<GemmPolicy_>;
+        using ADataType      = remove_cvref_t<typename Problem::ADataType>;
+        using BDataType      = remove_cvref_t<typename Problem::BDataType>;
+        using CDataType      = remove_cvref_t<typename Problem::CDataType>;
+        using BlockGemmShape = remove_cvref_t<typename Problem::BlockGemmShape>;
+
+        static constexpr index_t kBlockSize = Problem::kBlockSize;
+
+        static constexpr index_t MPerBlock = BlockGemmShape::kM;
+        static constexpr index_t NPerBlock = BlockGemmShape::kN;
+        static constexpr index_t KPerBlock = BlockGemmShape::kK;
+
+        static constexpr auto config = Policy::template GetWarpGemmMWarpNWarp<Problem>();
+        using WarpGemm               = remove_cvref_t<decltype(config.template at<0>())>;
+
+        static constexpr index_t MWarp = config.template at<1>();
+        static constexpr index_t NWarp = config.template at<2>();
+
+        static constexpr index_t MIterPerWarp = MPerBlock / (MWarp * WarpGemm::kM);
+        static constexpr index_t NIterPerWarp = NPerBlock / (NWarp * WarpGemm::kN);
+        static constexpr index_t KIterPerWarp = KPerBlock / WarpGemm::kK;
+
+        static constexpr index_t KPack = WarpGemm::kKPerThread;
+    };
+
+    public:
+    using Problem        = remove_cvref_t<Problem_>;
+    using Policy         = remove_cvref_t<Policy_>;
+    using Traits         = GemmTraits_<Problem, Policy>;
+    using WarpGemm       = typename Traits::WarpGemm;
+    using BlockGemmShape = typename Traits::BlockGemmShape;
+
+    using ADataType = remove_cvref_t<typename Traits::ADataType>;
+    using BDataType = remove_cvref_t<typename Traits::BDataType>;
+    using CDataType = remove_cvref_t<typename Traits::CDataType>;
+
+    static constexpr index_t KIterPerWarp = Traits::KIterPerWarp;
+    static constexpr index_t MIterPerWarp = Traits::MIterPerWarp;
+    static constexpr index_t NIterPerWarp = Traits::NIterPerWarp;
+
+    static constexpr index_t MWarp = Traits::MWarp;
+    static constexpr index_t NWarp = Traits::NWarp;

     CK_TILE_DEVICE static constexpr auto MakeABlockDistributionEncode()
     {

@@ -43,7 +73,7 @@ struct BlockGemmARegBRegCRegV1
                                                        sequence<1, 2>,
                                                        sequence<0, 0>>{};

         constexpr auto a_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
-            a_block_outer_dstr_encoding, typename WG::AWarpDstrEncoding{});
+            a_block_outer_dstr_encoding, typename WarpGemm::AWarpDstrEncoding{});

         return a_block_dstr_encode;
     }

@@ -58,7 +88,7 @@ struct BlockGemmARegBRegCRegV1
                                                        sequence<1, 2>,
                                                        sequence<0, 0>>{};

         constexpr auto b_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
-            b_block_outer_dstr_encoding, typename WG::BWarpDstrEncoding{});
+            b_block_outer_dstr_encoding, typename WarpGemm::BWarpDstrEncoding{});

         return b_block_dstr_encode;
     }

@@ -73,7 +103,7 @@ struct BlockGemmARegBRegCRegV1
                                                        sequence<1, 2>,
                                                        sequence<0, 0>>{};

         constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
-            c_block_outer_dstr_encoding, typename WG::CWarpDstrEncoding{});
+            c_block_outer_dstr_encoding, typename WarpGemm::CWarpDstrEncoding{});

         return c_block_dstr_encode;
     }

@@ -112,13 +142,13 @@ struct BlockGemmARegBRegCRegV1
                               .get_static_tile_distribution_encoding())>>,
                      "C distribution is wrong!");

-        using AWarpDstr = typename WG::AWarpDstr;
-        using BWarpDstr = typename WG::BWarpDstr;
-        using CWarpDstr = typename WG::CWarpDstr;
+        using AWarpDstr = typename WarpGemm::AWarpDstr;
+        using BWarpDstr = typename WarpGemm::BWarpDstr;
+        using CWarpDstr = typename WarpGemm::CWarpDstr;

-        using AWarpTensor = typename WG::AWarpTensor;
-        using BWarpTensor = typename WG::BWarpTensor;
-        using CWarpTensor = typename WG::CWarpTensor;
+        using AWarpTensor = typename WarpGemm::AWarpTensor;
+        using BWarpTensor = typename WarpGemm::BWarpTensor;
+        using CWarpTensor = typename WarpGemm::CWarpTensor;

         constexpr auto a_warp_y_lengths =
             to_sequence(AWarpDstr{}.get_ys_to_d_descriptor().get_lengths());

@@ -157,7 +187,7 @@ struct BlockGemmARegBRegCRegV1
                     merge_sequences(sequence<1, 1>{}, c_warp_y_lengths));

                 // warp GEMM
-                WG{}(c_warp_tensor, a_warp_tensor, b_warp_tensor);
+                WarpGemm{}(c_warp_tensor, a_warp_tensor, b_warp_tensor);

                 // write C warp tensor into C block tensor
                 c_block_tensor.set_y_sliced_thread_data(

@@ -180,7 +210,7 @@ struct BlockGemmARegBRegCRegV1
                                                        sequence<0, 0>>{};

         constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
-            c_block_outer_dstr_encoding, typename WG::CWarpDstrEncoding{});
+            c_block_outer_dstr_encoding, typename WarpGemm::CWarpDstrEncoding{});

         constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode);
         auto c_block_tensor         = make_static_distributed_tensor<CDataType>(c_block_dstr);
         return c_block_tensor;
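The first hunk above mostly re-homes compile-time derivations: a private GemmTraits_ now owns them (including the new KPack = WarpGemm::kKPerThread), and the public class re-exports what callers previously reached through the old WG alias. A minimal sketch of that pattern with illustrative names (not the CK definitions):

```cpp
// Derived compile-time quantities live in a private traits struct; the public
// class re-exports them, keeping its surface stable while letting other
// pipelines reuse the trait math. MWarp/WarpM values are assumed for the demo.
template <typename Problem>
struct GemmTraits_
{
    static constexpr int MPerBlock    = Problem::kM;
    static constexpr int MWarp        = 2;  // assumed policy result
    static constexpr int WarpM        = 16; // assumed WarpGemm::kM
    static constexpr int MIterPerWarp = MPerBlock / (MWarp * WarpM);
};

template <typename Problem>
struct BlockGemm
{
    using Traits = GemmTraits_<Problem>;
    static constexpr int MIterPerWarp = Traits::MIterPerWarp; // re-export
};

struct MyProblem { static constexpr int kM = 128; };
static_assert(BlockGemm<MyProblem>::MIterPerWarp == 4, "128 / (2 * 16)");

int main() { return 0; }
```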
include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp

@@ -463,7 +463,9 @@ struct GemmKernel
      * @param a_ptr input A pointer
      * @param b_ptr input B pointer
      * @param c_ptr output C pointer
+     * @param smem_ptr_0 The start memory pointer of the shared memory block.
      * @param kargs GEMM kernel arguments
      * @param splitk_batch_offset splitk_batch_offset Utility structure used to calculate k batch.
      * @param block_idx_m The GEMM's output M dimension tile index processed by this workgroup.
      * @param block_idx_n The GEMM's output N dimension tile index processed by this workgroup.
      *

@@ -473,7 +475,7 @@ struct GemmKernel
     CK_TILE_DEVICE static void RunGemm(const ADataType* a_ptr,
                                        const BDataType* b_ptr,
                                        CDataType* c_ptr,
-                                       void* smem_ptr,
+                                       void* smem_ptr_0,
                                        const GemmKernelArgs& kargs,
                                        const SplitKBatchOffset& splitk_batch_offset,
                                        const index_t block_idx_m,

@@ -491,15 +493,67 @@ struct GemmKernel
         // Run GEMM cooperatively by whole workgroup.
         const auto& a_block_window = gemm_tile_windows.at(I0);
         const auto& b_block_window = gemm_tile_windows.at(I1);

-        const auto& c_block_tile = GemmPipeline{}.template operator()(
-            a_block_window, b_block_window, num_loop, smem_ptr);
+        const auto& c_block_tile = GemmPipeline{}.template operator()(
+            a_block_window, b_block_window, num_loop, smem_ptr_0);

         // Run Epilogue Pipeline
         auto& c_block_window = gemm_tile_windows.at(I2);

         EpiloguePipeline{}
             .template operator()<decltype(c_block_window), decltype(c_block_tile), DstInMemOp>(
-                c_block_window, c_block_tile, smem_ptr);
+                c_block_window, c_block_tile, smem_ptr_0);
     }

+    /**
+     * @brief Runs single GEMM problem cooperatively by whole workgroup.
+     *
+     * @note RunGEMM2LDS in with two shared memory buffers using the ping pong buffer mechanism.
+     *
+     * @param a_ptr input A pointer
+     * @param b_ptr input B pointer
+     * @param c_ptr output C pointer
+     * @param smem_ptr_0 The starting pointer of 1st shared memory block.
+     * @param smem_ptr_1 The starting pointer of 2nd shared memory block.
+     * @param kargs GEMM kernel arguments
+     * @param splitk_batch_offset Utility structure used to calculate k batch.
+     * @param block_idx_m The GEMM's output M dimension tile index processed by this workgroup.
+     * @param block_idx_n The GEMM's output N dimension tile index processed by this workgroup.
+     *
+     * @tparam DstInMemOp Destination memory operation (default: set).
+     */
+    template <memory_operation_enum DstInMemOp = memory_operation_enum::set>
+    CK_TILE_DEVICE static void RunGemm2LDS(const ADataType* a_ptr,
+                                           const BDataType* b_ptr,
+                                           CDataType* c_ptr,
+                                           void* __restrict__ smem_ptr_0,
+                                           void* __restrict__ smem_ptr_1,
+                                           const GemmKernelArgs& kargs,
+                                           const SplitKBatchOffset& splitk_batch_offset,
+                                           const index_t block_idx_m,
+                                           const index_t block_idx_n)
+    {
+        // Create Gemm tensor views, pad views and tile windows
+        const auto& gemm_tensor_views_tuple =
+            MakeGemmTensorViews<DstInMemOp>(a_ptr, b_ptr, c_ptr, kargs, splitk_batch_offset);
+        const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple);
+        auto gemm_tile_windows     = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);
+
+        const index_t num_loop = TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k);
+
+        // Run GEMM cooperatively by whole workgroup.
+        const auto& a_block_window = gemm_tile_windows.at(I0);
+        const auto& b_block_window = gemm_tile_windows.at(I1);
+
+        const auto& c_block_tile = GemmPipeline{}.template operator()(
+            a_block_window, b_block_window, num_loop, smem_ptr_0, smem_ptr_1);
+
+        // Run Epilogue Pipeline
+        auto& c_block_window = gemm_tile_windows.at(I2);
+
+        EpiloguePipeline{}
+            .template operator()<decltype(c_block_window), decltype(c_block_tile), DstInMemOp>(
+                c_block_window, c_block_tile, smem_ptr_0);
+    }
+
     CK_TILE_DEVICE void operator()(GemmKernelArgs kargs) const

@@ -517,11 +571,27 @@ struct GemmKernel
         CDataType* c_ptr = static_cast<CDataType*>(kargs.c_ptr);

         // allocate LDS
-        __shared__ char smem_ptr[GetSmemSize()];
+        __shared__ char smem_ptr_0[GetSmemSize()];
+        __shared__ char smem_ptr_1[GetSmemSize()];

         if(kargs.k_batch == 1)
         {
-            RunGemm(a_ptr, b_ptr, c_ptr, smem_ptr, kargs, splitk_batch_offset, i_m, i_n);
+            if constexpr(GemmPipeline::DoubleSmemBuffer == true)
+            {
+                RunGemm2LDS(a_ptr,
+                            b_ptr,
+                            c_ptr,
+                            smem_ptr_0,
+                            smem_ptr_1,
+                            kargs,
+                            splitk_batch_offset,
+                            i_m,
+                            i_n);
+            }
+            else
+            {
+                RunGemm(a_ptr, b_ptr, c_ptr, smem_ptr_0, kargs, splitk_batch_offset, i_m, i_n);
+            }
         }
         else
         {

@@ -530,8 +600,23 @@ struct GemmKernel
             if constexpr(!(EpiloguePipeline::GetVectorSizeC() % 2 != 0 &&
                            is_any_of<CDataType, fp16_t, bf16_t>::value))
             {
-                RunGemm<memory_operation_enum::atomic_add>(
-                    a_ptr, b_ptr, c_ptr, smem_ptr, kargs, splitk_batch_offset, i_m, i_n);
+                if constexpr(GemmPipeline::DoubleSmemBuffer == true)
+                {
+                    RunGemm2LDS<memory_operation_enum::atomic_add>(a_ptr,
+                                                                   b_ptr,
+                                                                   c_ptr,
+                                                                   smem_ptr_0,
+                                                                   smem_ptr_1,
+                                                                   kargs,
+                                                                   splitk_batch_offset,
+                                                                   i_m,
+                                                                   i_n);
+                }
+                else
+                {
+                    RunGemm<memory_operation_enum::atomic_add>(
+                        a_ptr, b_ptr, c_ptr, smem_ptr_0, kargs, splitk_batch_offset, i_m, i_n);
+                }
             }
         }
     }
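RunGemm2LDS is dispatched when the pipeline advertises DoubleSmemBuffer and receives two LDS blocks that alternate roles each K-iteration. A minimal host-side sketch of that ping-pong schedule (illustrative only; the real pipeline overlaps global-to-LDS copies with MFMA work on the other buffer):

```cpp
#include <cstdio>

// Two buffers alternate producer/consumer roles, so the compute phase for
// tile i never waits on the staging of tile i+1.
int main()
{
    int lds[2]         = {-1, -1}; // stand-ins for smem_ptr_0 / smem_ptr_1
    const int num_loop = 4;

    lds[0] = 0; // prologue: stage tile 0 into buffer 0
    for(int i = 0; i < num_loop; ++i)
    {
        if(i + 1 < num_loop)
            lds[(i + 1) % 2] = i + 1; // prefetch the next tile into the idle buffer
        std::printf("compute on tile %d from buffer %d\n", lds[i % 2], i % 2);
    }
    return 0;
}
```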
include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp

@@ -41,20 +41,26 @@ struct GemmPipelineAgBgCrImplBase
         store_tile(lds_tile_window, block_tile_tmp);
     }

+    template <typename DstBlockTile, typename SrcTileWindow>
+    CK_TILE_DEVICE void LocalPrefetch(DstBlockTile& dst_block_tile,
+                                      const SrcTileWindow& lds_tile_window) const
+    {
+        load_tile(dst_block_tile, lds_tile_window);
+    }
+
     CK_TILE_DEVICE auto GetABLdsTensorViews(void* p_smem) const
     {
         // A tile in LDS
-        ADataType* p_a_lds = static_cast<ADataType*>(p_smem);
+        ADataType* __restrict__ p_a_lds = static_cast<ADataType*>(p_smem);

         constexpr auto a_lds_block_desc = Policy::template MakeALdsBlockDescriptor<Problem>();
         auto a_lds_block = make_tensor_view<address_space_enum::lds>(p_a_lds, a_lds_block_desc);

         // TODO: LDS alignment should come from Policy!
-        constexpr index_t a_lds_block_space_size_aligned = integer_divide_ceil(
-            sizeof(ADataType) * a_lds_block_desc.get_element_space_size(), 16) * 16;
+        constexpr index_t a_lds_block_space_size_aligned = integer_least_multiple(
+            sizeof(ADataType) * a_lds_block_desc.get_element_space_size(), 16);

         // B tile in LDS
-        BDataType* p_b_lds = static_cast<BDataType*>(
+        BDataType* __restrict__ p_b_lds = static_cast<BDataType*>(
             static_cast<void*>(static_cast<char*>(p_smem) + a_lds_block_space_size_aligned));

         constexpr auto b_lds_block_desc = Policy::template MakeBLdsBlockDescriptor<Problem>();
         auto b_lds_block = make_tensor_view<address_space_enum::lds>(p_b_lds, b_lds_block_desc);
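The alignment refactor is behavior-preserving: integer_least_multiple(x, 16) computes the same value as integer_divide_ceil(x, 16) * 16. A sketch with stand-in implementations (assumed to match CK's helpers):

```cpp
// Rounding a byte count up to the next multiple of 16 via
// divide-ceil-then-multiply equals taking the least multiple of 16 not below
// the count, which the switch to integer_least_multiple makes explicit.
constexpr long integer_divide_ceil(long x, long y) { return (x + y - 1) / y; }
constexpr long integer_least_multiple(long x, long y) { return integer_divide_ceil(x, y) * y; }

static_assert(integer_least_multiple(33, 16) == integer_divide_ceil(33, 16) * 16, "");
static_assert(integer_least_multiple(33, 16) == 48, "");
static_assert(integer_least_multiple(48, 16) == 48, "");

int main() { return 0; }
```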