Merge branch 'develop' into codegen-enable-hiprtc

9aa17406 · arai713 · GitHub · d8de5ea5 · 4cfb24fe · 9aa17406
Unverified Commit 9aa17406 authored Feb 13, 2025 by arai713 Committed by GitHub Feb 13, 2025
20 changed files
--- a/example/ck_tile/03_gemm/CMakeLists.txt
+++ b/example/ck_tile/03_gemm/CMakeLists.txt
 add_executable(tile_example_gemm_basic EXCLUDE_FROM_ALL gemm_basic.cpp)
 add_executable(tile_example_gemm_universal EXCLUDE_FROM_ALL universal_gemm.cpp)
+target_compile_options(tile_example_gemm_universal PRIVATE
+  -mllvm -enable-noalias-to-md-conversion=0
+)
--- a/example/ck_tile/03_gemm/gemm_basic.hpp
+++ b/example/ck_tile/03_gemm/gemm_basic.hpp
@@ -11,21 +11,26 @@
 #include "ck_tile/ops/epilogue.hpp"
 #include "ck_tile/ops/gemm.hpp"
-#define CK_TILE_PIPELINE_COMPUTE 1
+#define CK_TILE_PIPELINE_COMPUTE_V3 1
 #define CK_TILE_PIPELINE_MEMORY 2
+#define CK_TILE_PIPELINE_COMPUTE_V4 3
 #ifndef CK_TILE_PIPELINE_DEFAULT
-#define CK_TILE_PIPELINE_DEFAULT CK_TILE_PIPELINE_COMPUTE
+#define CK_TILE_PIPELINE_DEFAULT CK_TILE_PIPELINE_COMPUTE_V3
 #endif
-#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE)
+#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY)
 #define GEMM_PIPELINE ck_tile::GemmPipelineAgBgCrMem
 #define UNIVERSAL_GEMM_PIPELINE ck_tile::BaseGemmPipelineAgBgCrMem
 #define GEMM_PIPELINE_SCHEDULER ck_tile::GemmPipelineScheduler::Interwave
-#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE)
+#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V3)
 #define GEMM_PIPELINE ck_tile::GemmPipelineAgBgCrCompV3
 #define UNIVERSAL_GEMM_PIPELINE ck_tile::BaseGemmPipelineAgBgCrCompV3
 #define GEMM_PIPELINE_SCHEDULER ck_tile::GemmPipelineScheduler::Intrawave
+#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V4)
+#define GEMM_PIPELINE ck_tile::GemmPipelineAgBgCrCompV4
+#define UNIVERSAL_GEMM_PIPELINE ck_tile::BaseGemmPipelineAgBgCrCompV4
+#define GEMM_PIPELINE_SCHEDULER ck_tile::GemmPipelineScheduler::Intrawave
 #else
 #error "unsupported CK_TILE_PIPELINE_DEFAULT value"
 #endif
@@ -126,7 +131,8 @@ auto create_args(int argc, char* argv[])
        .insert("warmup", "50", "number of iterations before benchmark the kernel")
        .insert("repeat", "100", "number of iterations to benchmark the kernel")
        .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
-        .insert("split_k", "1", "splitK value");
+        .insert("split_k", "1", "splitK value")
+        .insert("init", "0", "0:random, 1:linear, 2:constant(1)");
    bool result = arg_parser.parse(argc, argv);
    return std::make_tuple(result, arg_parser);

--- a/example/ck_tile/03_gemm/run_gemm_example.inc
+++ b/example/ck_tile/03_gemm/run_gemm_example.inc
@@ -110,6 +110,7 @@ int run_gemm_example_with_layouts(int argc,
    ck_tile::index_t kbatch = arg_parser.get_int("split_k");
    int n_warmup            = arg_parser.get_int("warmup");
    int n_repeat            = arg_parser.get_int("repeat");
+    ck_tile::index_t init_method = arg_parser.get_int("init");  
    stride_A = ck_tile::get_default_stride(M, K, stride_A, is_row_major(a_layout));
    stride_B = ck_tile::get_default_stride(K, N, stride_B, is_row_major(b_layout));
@@ -122,9 +123,19 @@ int run_gemm_example_with_layouts(int argc,
    ck_tile::HostTensor<CDataType> c_m_n_dev_result(
        ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
-    // TODO: add different init types
+    if (init_method == 0) {
-    ck_tile::FillUniformDistribution<ADataType>{-5.f, 5.f}(a_m_k);
+        ck_tile::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k);
-    ck_tile::FillUniformDistribution<BDataType>{-5.f, 5.f}(b_k_n);
+        ck_tile::FillUniformDistribution<BDataType>{-1.f, 1.f}(b_k_n);
+    } else if (init_method == 1) {
+        ck_tile::FillMonotonicSeq<ADataType>{}(a_m_k);
+        ck_tile::FillMonotonicSeq<BDataType>{}(b_k_n);
+    } else if (init_method == 2) {
+        ck_tile::FillConstant<ADataType>{static_cast<ADataType>(1)}(a_m_k);
+        ck_tile::FillConstant<BDataType>{static_cast<BDataType>(1)}(b_k_n);
+    } else {
+        a_m_k.SetZero();
+        b_k_n.SetZero();
+    }
    ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes());
    ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes());

--- a/example/ck_tile/03_gemm/script/benchmark_basic.sh
+++ b/example/ck_tile/03_gemm/script/benchmark_basic.sh
@@ -2,7 +2,6 @@
 EXE="$(find . -name tile_example_gemm_basic -type f | head -n 1)"
 VALID=1
 for b_matrix_layout in "C"; do
    for m in "64" "512" "1024" "2048"; do
        for n in "512" "1024" "2048"; do

--- a/example/ck_tile/03_gemm/universal_gemm.cpp
+++ b/example/ck_tile/03_gemm/universal_gemm.cpp
@@ -34,8 +34,10 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config&
    constexpr ck_tile::index_t M_Warp_Tile = 32;
    constexpr ck_tile::index_t N_Warp_Tile = 32;
    constexpr ck_tile::index_t K_Warp_Tile = 8;
+    constexpr bool DoubleSmemBuffer = false;
 #endif
-#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE)
+#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V3)
    // Compute friendly for Intrawave scheduler
    constexpr ck_tile::index_t M_Tile = 256;
    constexpr ck_tile::index_t N_Tile = 256;
@@ -48,6 +50,24 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config&
    constexpr ck_tile::index_t M_Warp_Tile = 32;
    constexpr ck_tile::index_t N_Warp_Tile = 32;
    constexpr ck_tile::index_t K_Warp_Tile = 16;
+    constexpr bool DoubleSmemBuffer = false;
+#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V4)
+    // Compute friendly for Intrawave scheduler
+    // Using the ping pong reader in the lds level
+    constexpr ck_tile::index_t M_Tile = 256;
+    constexpr ck_tile::index_t N_Tile = 256;
+    constexpr ck_tile::index_t K_Tile = 32;
+    constexpr ck_tile::index_t M_Warp = 2;
+    constexpr ck_tile::index_t N_Warp = 2;
+    constexpr ck_tile::index_t K_Warp = 1;
+    constexpr ck_tile::index_t M_Warp_Tile = 32;
+    constexpr ck_tile::index_t N_Warp_Tile = 32;
+    constexpr ck_tile::index_t K_Warp_Tile = 16;
+    constexpr bool DoubleSmemBuffer = true;
 #endif
    constexpr bool kPadM = false;
@@ -70,8 +90,14 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config&
        GemmSpatiallyLocalTilePartitioner<GemmShape, TileParitionerGroupNum, TileParitionerM01>;
    using Traits = ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>;
-    using GemmUniversalTraits = ck_tile::
+    using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits<kPadM,
-        TileGemmUniversalTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout, TransposeC>;
+                                                                 kPadN,
+                                                                 kPadK,
+                                                                 DoubleSmemBuffer,
+                                                                 ALayout,
+                                                                 BLayout,
+                                                                 CLayout,
+                                                                 TransposeC>;
    using GemmPipelineProblem =
        ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>;
@@ -99,8 +125,7 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config&
                                                                           has_hot_loop_v,
                                                                           tail_number_v>;
-        using GemmPipeline =
+        using GemmPipeline = GEMM_PIPELINE<UniversalGemmProblem>;
-            GEMM_PIPELINE<UniversalGemmProblem, ck_tile::UniversalGemmPipelineAgBgCrPolicy>;
        using GemmEpilogue = ck_tile::CShuffleEpilogue<
            ck_tile::CShuffleEpilogueProblem<AccDataType,
                                             CDataType,
@@ -140,7 +165,7 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config&
    if(has_hot_loop)
    {
-#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE)
+#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V3)
        if(tail_num == ck_tile::TailNumber::Full)
        {
            Run(ck_tile::bool_constant<true>{},
@@ -215,6 +240,17 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config&
                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Seven>{});
            }
        }
+#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V4)
+        if(tail_num == ck_tile::TailNumber::Three)
+        {
+            Run(ck_tile::bool_constant<true>{},
+                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Three>{});
+        }
+        else
+        {
+            Run(ck_tile::bool_constant<true>{},
+                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Two>{});
+        }
 #endif
    }
    else

--- a/example/ck_tile/15_fused_moe/fused_moe.hpp
+++ b/example/ck_tile/15_fused_moe/fused_moe.hpp
@@ -8,14 +8,15 @@
 struct fused_moe_args
 {
-    const void* a_ptr;              // [m, k], input token
+    const void* a_ptr;                 // [m, k], input token
-    const void* a_scale_ptr;        // [m, 1], token scale
+    const void* a_scale_ptr;           // [m, 1], token scale
-    const void* g_ptr;              // [e, n, k]/[e, 2*n, k], pre-shuffle([e, nr, kr, w])
+    const void* g_ptr;                 // [e, n, k]/[e, 2*n, k], pre-shuffle([e, nr, kr, w])
-    const void* d_ptr;              // [e, n, k], pre-shuffle([e, nr, kr, w])
+    const void* d_ptr;                 // [e, n, k], pre-shuffle([e, nr, kr, w])
-    const void* g_scale_ptr;        // [e, 1, n], gate(up) scale
+    const void* g_scale_ptr;           // [e, 1, n], gate(up) scale
-    const void* d_scale_ptr;        // [e, 1, k], down scale
+    const void* d_scale_ptr;           // [e, 1, k], down scale
-    const void* y_smooth_scale_ptr; // [e, 1, n], smooth-quant-scale for 2nd gemm input
+    const void* y_smooth_scale_ptr;    // [e, 1, n], smooth-quant-scale for 2nd gemm input
-    void* o_ptr;                    // [m, k], output token (no need to do zeroing)
+    const void* local_expert_mask_ptr; // [e], local_expert_mask_ptr for EP
+    void* o_ptr;                       // [m, k], output token (no need to do zeroing)
    const void* topk_ids_ptr;    // [tokens, topk]
    const void* topk_weight_ptr; // [tokens, topk]
@@ -48,6 +49,8 @@ struct fused_moe_traits
    int activation;  // 0:gelu, 1:silu
    int gate_only;   // 0:g1u0, 1:g1u1
    int fused_quant; // 0:no-sweep, 1:smooth-dynamic-quant, 2:dynamic-quant
+    bool local_expert_masking; // if mask experts as local expert
 };
 float fused_moe(fused_moe_traits, fused_moe_args, const ck_tile::stream_config&);
--- a/example/ck_tile/15_fused_moe/fused_moesorting.hpp
+++ b/example/ck_tile/15_fused_moe/fused_moesorting.hpp
@@ -10,7 +10,8 @@
 struct fused_moesorting_trait
 {
    std::string index_type;
-    std::string weight_type; // currently always float
+    std::string weight_type;   // currently always float
+    bool local_expert_masking; // if mask experts as local expert
 };
 struct fused_moesorting_args : public ck_tile::MoeSortingHostArgs

--- a/example/ck_tile/15_fused_moe/instances/fused_moe_api.cpp
+++ b/example/ck_tile/15_fused_moe/instances/fused_moe_api.cpp
@@ -17,10 +17,11 @@ float fused_moe(fused_moe_traits t, fused_moe_args a, const ck_tile::stream_conf
        return 1;
    }();
-    auto t0 = fused_moesorting_trait{"int32", "fp32"};
+    auto t0 = fused_moesorting_trait{"int32", "fp32", t.local_expert_masking};
    auto a0 = fused_moesorting_args{
        a.topk_ids_ptr,                              // const void* p_topk_ids;
        a.topk_weight_ptr,                           // const void* p_weights;
+        a.local_expert_mask_ptr,                     // const void* p_local_expert_mask;
        a.sorted_token_ids_ptr,                      // void* p_sorted_token_ids;
        a.sorted_weight_ptr,                         // void* p_sorted_weights;
        a.sorted_expert_ids_ptr,                     // void* p_sorted_expert_ids;

--- a/example/ck_tile/15_fused_moe/instances/fused_moesorting_api.cpp
+++ b/example/ck_tile/15_fused_moe/instances/fused_moesorting_api.cpp
@@ -24,20 +24,63 @@
    return ave_time;
 #else
-#define MOE_SORTING_DISPATCH_(sub_token_tile_, sub_token_onshot_)                                \
-    constexpr ck_tile::index_t sub_token_tile = sub_token_tile_;                                 \
+#define MOE_SORTING_DISPATCH_(sub_token_tile_, sub_token_onshot_, local_expert_masking_)                \
-    constexpr bool sub_token_onshot           = sub_token_onshot_;                               \
+    constexpr ck_tile::index_t sub_token_tile = sub_token_tile_;                                        \
-    using ms_problem =                                                                           \
+    constexpr bool sub_token_onshot           = sub_token_onshot_;                                      \
-        ck_tile::MoeSortingProblemEx<index_t, ms_weight_type, sub_token_tile, sub_token_onshot>; \
+    constexpr bool local_expert_masking       = local_expert_masking_;                                  \
-    using kernel         = ck_tile::MoeSortingKernel<ms_problem>;                                \
+    using ms_problem                          = ck_tile::MoeSortingProblemEx<index_t,                   \
-    auto kargs           = kernel::MakeKargs(a);                                                 \
+                                                    ms_weight_type,            \
-    const dim3 grids     = kernel::GridSize(a);                                                  \
+                                                    sub_token_tile,            \
-    const dim3 blocks    = kernel::BlockSize(a);                                                 \
+                                                    sub_token_onshot,          \
-    const auto lds_bytes = kernel::GetSmemSize(a);                                               \
+                                                    local_expert_masking>;     \
-    float ave_time       = ck_tile::launch_kernel(                                               \
+    using kernel                              = ck_tile::MoeSortingKernel<ms_problem>;                  \
-        s, ck_tile::make_kernel(kernel{}, grids, blocks, lds_bytes, kargs));               \
+    auto kargs                                = kernel::MakeKargs(a);                                   \
+    const dim3 grids                          = kernel::GridSize(a);                                    \
+    const dim3 blocks                         = kernel::BlockSize(a);                                   \
+    const auto lds_bytes                      = kernel::GetSmemSize(a);                                 \
+    float ave_time                            = ck_tile::launch_kernel(                                 \
+        s, ck_tile::make_kernel(kernel{}, grids, blocks, lds_bytes, kargs)); \
    return ave_time;
+#define MOE_SORTING_DISPATCH_SUB_TOKEN_(row_, sub_token_onshot_, local_expert_masking_) \
+    if(row_ % 8 == 0)                                                                   \
+    {                                                                                   \
+        MOE_SORTING_DISPATCH_(8, sub_token_onshot_, local_expert_masking_);             \
+    }                                                                                   \
+    else if(row_ % 4 == 0)                                                              \
+    {                                                                                   \
+        MOE_SORTING_DISPATCH_(4, sub_token_onshot_, local_expert_masking_);             \
+    }                                                                                   \
+    else if(row_ % 2 == 0)                                                              \
+    {                                                                                   \
+        MOE_SORTING_DISPATCH_(2, sub_token_onshot_, local_expert_masking_);             \
+    }                                                                                   \
+    else                                                                                \
+    {                                                                                   \
+        MOE_SORTING_DISPATCH_(1, sub_token_onshot_, local_expert_masking_);             \
+    }
+#define MOE_SORTING_DISPATCH_SUBTO_(row_, local_expert_masking_)            \
+    if(is_sub_token_onshot)                                                 \
+    {                                                                       \
+        MOE_SORTING_DISPATCH_SUB_TOKEN_(row_, true, local_expert_masking_)  \
+    }                                                                       \
+    else                                                                    \
+    {                                                                       \
+        MOE_SORTING_DISPATCH_SUB_TOKEN_(row_, false, local_expert_masking_) \
+    }
+#define MOE_SORTING_DISPATCH_EMASK_(row_)        \
+    if(is_local_expert_masking)                  \
+    {                                            \
+        MOE_SORTING_DISPATCH_SUBTO_(row_, true)  \
+    }                                            \
+    else                                         \
+    {                                            \
+        MOE_SORTING_DISPATCH_SUBTO_(row_, false) \
+    }
 #endif
 #if !MOE_SORTING_USE_EX_KERNEL
@@ -116,45 +159,10 @@ float fused_moesorting(fused_moesorting_trait t, fused_moesorting_args a, ck_til
        auto sub_token_          = r_ - 2;
        r_                       = (r_ - 2) / 8;
        bool is_sub_token_onshot = a.tokens <= sub_token_;
+        bool is_local_expert_masking = t.local_expert_masking;
        (void)c_;
-        if(is_sub_token_onshot)
-        {
+        MOE_SORTING_DISPATCH_EMASK_(r_);
-            if(r_ % 8 == 0)
-            {
-                MOE_SORTING_DISPATCH_(8, true);
-            }
-            else if(r_ % 4 == 0)
-            {
-                MOE_SORTING_DISPATCH_(4, true);
-            }
-            else if(r_ % 2 == 0)
-            {
-                MOE_SORTING_DISPATCH_(2, true);
-            }
-            else
-            {
-                MOE_SORTING_DISPATCH_(1, true);
-            }
-        }
-        else
-        {
-            if(r_ % 8 == 0)
-            {
-                MOE_SORTING_DISPATCH_(8, false);
-            }
-            else if(r_ % 4 == 0)
-            {
-                MOE_SORTING_DISPATCH_(4, false);
-            }
-            else if(r_ % 2 == 0)
-            {
-                MOE_SORTING_DISPATCH_(2, false);
-            }
-            else
-            {
-                MOE_SORTING_DISPATCH_(1, false);
-            }
-        }
        // MOE_SORTING_DISPATCH_ETILE(0, 0);
 #endif
    }

--- a/example/ck_tile/15_fused_moe/main.cpp
+++ b/example/ck_tile/15_fused_moe/main.cpp
@@ -140,28 +140,29 @@ bool run(const ck_tile::ArgParser& arg_parser)
    ck_tile::index_t activation        = arg_parser.get_int("act");
    if(stride < 0)
        stride = hidden_size;
-    std::string prec_i  = arg_parser.get_str("prec_i");
+    std::string prec_i        = arg_parser.get_str("prec_i");
-    std::string prec_w  = arg_parser.get_str("prec_w");
+    std::string prec_w        = arg_parser.get_str("prec_w");
-    std::string prec_o  = arg_parser.get_str("prec_o");
+    std::string prec_o        = arg_parser.get_str("prec_o");
-    std::string prec_st = arg_parser.get_str("prec_st");
+    std::string prec_st       = arg_parser.get_str("prec_st");
-    std::string prec_sw = arg_parser.get_str("prec_sw");
+    std::string prec_sw       = arg_parser.get_str("prec_sw");
-    std::string prec_sq = arg_parser.get_str("prec_sq");
+    std::string prec_sq       = arg_parser.get_str("prec_sq");
-    std::string prec_kw = arg_parser.get_str("prec_kw");
+    std::string prec_kw       = arg_parser.get_str("prec_kw");
-    prec_st             = (prec_st == "auto") ? "fp32" : prec_st;
+    prec_st                   = (prec_st == "auto") ? "fp32" : prec_st;
-    prec_sw             = (prec_sw == "auto") ? "fp32" : prec_sw;
+    prec_sw                   = (prec_sw == "auto") ? "fp32" : prec_sw;
-    prec_sq             = (prec_sq == "auto") ? "fp32" : prec_sq;
+    prec_sq                   = (prec_sq == "auto") ? "fp32" : prec_sq;
-    prec_kw             = (prec_kw == "auto") ? "fp32" : prec_kw;
+    prec_kw                   = (prec_kw == "auto") ? "fp32" : prec_kw;
-    int kname           = arg_parser.get_int("kname");
+    int kname                 = arg_parser.get_int("kname");
-    int do_validation   = arg_parser.get_int("v");
+    int do_validation         = arg_parser.get_int("v");
-    int warmup          = arg_parser.get_int("warmup");
+    int warmup                = arg_parser.get_int("warmup");
-    int repeat          = arg_parser.get_int("repeat");
+    int repeat                = arg_parser.get_int("repeat");
-    int fused_quant     = arg_parser.get_int("fquant");
+    int fused_quant           = arg_parser.get_int("fquant");
-    int gate_only       = arg_parser.get_int("gate_only");
+    int gate_only             = arg_parser.get_int("gate_only");
-    int api             = arg_parser.get_int("api");
+    int api                   = arg_parser.get_int("api");
-    int balance         = arg_parser.get_int("balance");
+    int balance               = arg_parser.get_int("balance");
-    int tp              = arg_parser.get_int("tp");
+    int tp                    = arg_parser.get_int("tp");
-    int init            = arg_parser.get_int("init");
+    int init                  = arg_parser.get_int("init");
-    uint32_t seed       = arg_parser.get_uint32("seed");
+    uint32_t seed             = arg_parser.get_uint32("seed");
+    bool local_expert_masking = false; // TODO...
    // w0 (Gate+Up or Gate only, N size)
    ck_tile::index_t shared_intermediate_size_0 = intermediate_size * (gate_only ? 1 : 2) / tp;
@@ -230,6 +231,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
    ck_tile::HostTensor<YSmoothScaleDataType> sy_host({shared_intermediate_size_1}); // smooth-quant
    ck_tile::HostTensor<IndexDataType> topk_ids_host({tokens, topk});                // to be sort
    ck_tile::HostTensor<TopkWeightDataType> topk_weight_host({tokens, topk});        // to be sort
+    ck_tile::HostTensor<IndexDataType> local_expert_mask_host({experts});
    int max_num_tokens_padded = topk * tokens + experts * block_m - topk;
    ck_tile::HostTensor<IndexDataType> sorted_token_ids_host({max_num_tokens_padded});
@@ -355,6 +357,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
        ck_tile::DeviceMem sg_buf(sg_host);
        ck_tile::DeviceMem sd_buf(sd_host);
        ck_tile::DeviceMem sy_buf(sy_host);
+        ck_tile::DeviceMem local_expert_mask_buf(local_expert_mask_host);
        ck_tile::DeviceMem o_buf(o_host.get_element_space_size_in_bytes());
        ck_tile::DeviceMem topk_ids_buf(topk_ids_host);
@@ -378,7 +381,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
                                block_m,
                                activation,
                                gate_only,
-                                fused_quant};
+                                fused_quant,
+                                local_expert_masking};
        fused_moe_args args{a_buf.GetDeviceBuffer(),
                            fused_quant != 0 ? sa_buf.GetDeviceBuffer() : nullptr,
@@ -387,6 +391,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
                            fused_quant != 0 ? sg_buf.GetDeviceBuffer() : nullptr,
                            fused_quant != 0 ? sd_buf.GetDeviceBuffer() : nullptr,
                            fused_quant == 1 ? sy_buf.GetDeviceBuffer() : nullptr,
+                            local_expert_masking ? local_expert_mask_buf.GetDeviceBuffer()
+                                                 : nullptr,
                            o_buf.GetDeviceBuffer(),
                            topk_ids_buf.GetDeviceBuffer(),
                            topk_weight_buf.GetDeviceBuffer(),
@@ -442,12 +448,14 @@ bool run(const ck_tile::ArgParser& arg_parser)
            ck_tile::reference_moe_sorting<TopkWeightDataType, IndexDataType>(
                topk_ids_host,
                topk_weight_host,
+                local_expert_mask_host,
                sorted_token_ids_host,
                sorted_weight_host,
                sorted_expert_ids_host,
                num_sorted_tiles_host.mData[0],
                experts,
-                block_m);
+                block_m,
+                local_expert_masking);
            if(activation == 0)
            {
                CPU_FUSED_MOE(ck_tile::element_wise::Gelu);
@@ -472,12 +480,14 @@ bool run(const ck_tile::ArgParser& arg_parser)
        ck_tile::reference_moe_sorting<TopkWeightDataType, IndexDataType>(
            topk_ids_host,
            topk_weight_host,
+            local_expert_mask_host,
            sorted_token_ids_host,
            sorted_weight_host,
            sorted_expert_ids_host,
            num_sorted_tiles_host.mData[0],
            experts,
-            block_m);
+            block_m,
+            local_expert_masking);
        // done, preparing GPU buffer
        ck_tile::DeviceMem a_buf(a_host);

--- a/include/ck_tile/core/utility/transpose_vectors.hpp
+++ b/include/ck_tile/core/utility/transpose_vectors.hpp
@@ -68,52 +68,82 @@ struct transpose_vectors
        }
        else if constexpr(sizeof(S) == 1)
        {
-            static_assert((NX % 4 == 0 && NY % 4 == 0), "wrong!");
+            static_assert(((NX % 4 == 0 && NY % 4 == 0) || (NX % 2 == 0 && NY % 2 == 0)), "wrong!");
            using S4 = array<S, 4>; // typename array<S, 4>::type;
+            using S2 = array<S, 2>; // typename array<S, 4>::type;
-            // loop over 4x4 tile and transpose data from vx_tuple into vy_tuple
-            static_for<0, NY, 4>{}([&](auto iy) {
+            if constexpr(NX % 4 == 0 && NY % 4 == 0)
-                static_for<0, NX, 4>{}([&](auto ix) {
+            {
-                    // 4 int8x4 data from vx_tuple
+                // loop over 4x4 tile and transpose data from vx_tuple into vy_tuple
-                    const int32_t x_s4_0 =
+                static_for<0, NY, 4>{}([&](auto iy) {
-                        bit_cast<int32_t>(vx_tuple[ix].template get_as<S4>()[iy / I4]);
+                    static_for<0, NX, 4>{}([&](auto ix) {
-                    const int32_t x_s4_1 =
+                        // 4 int8x4 data from vx_tuple
-                        bit_cast<int32_t>(vx_tuple[ix + I1].template get_as<S4>()[iy / I4]);
+                        const int32_t x_s4_0 =
-                    const int32_t x_s4_2 =
+                            bit_cast<int32_t>(vx_tuple[ix].template get_as<S4>()[iy / I4]);
-                        bit_cast<int32_t>(vx_tuple[ix + I2].template get_as<S4>()[iy / I4]);
+                        const int32_t x_s4_1 =
-                    const int32_t x_s4_3 =
+                            bit_cast<int32_t>(vx_tuple[ix + I1].template get_as<S4>()[iy / I4]);
-                        bit_cast<int32_t>(vx_tuple[ix + I3].template get_as<S4>()[iy / I4]);
+                        const int32_t x_s4_2 =
+                            bit_cast<int32_t>(vx_tuple[ix + I2].template get_as<S4>()[iy / I4]);
-                    // transpose
+                        const int32_t x_s4_3 =
-                    int32_t t_s4_0, t_s4_1;
+                            bit_cast<int32_t>(vx_tuple[ix + I3].template get_as<S4>()[iy / I4]);
-                    int32_t y_s4_0, y_s4_1, y_s4_2, y_s4_3;
+                        // transpose
-                    constexpr int32_t m0 = 0x05010400;
+                        int32_t t_s4_0, t_s4_1;
-                    constexpr int32_t m1 = 0x05040100;
+                        int32_t y_s4_0, y_s4_1, y_s4_2, y_s4_3;
-                    constexpr int32_t m2 = 0x07060302;
-                    constexpr int32_t m3 = 0x07030602;
+                        constexpr int32_t m0 = 0x05010400;
+                        constexpr int32_t m1 = 0x05040100;
-                    // ex: v_perm_b32(0x 11 22 33 44, 0x 55 66 77 88, 0x 05 01 04 00) -> 0x33774488
+                        constexpr int32_t m2 = 0x07060302;
-                    //                   -- -- -- --     -- -- -- --      -  -  -  -
+                        constexpr int32_t m3 = 0x07030602;
-                    //             index  7  6  5  4      3  2  1  0     33 77 44 88
-                    // index is reversed because of little endianness (least significant bits first)
+                        // ex: v_perm_b32(0x 11 22 33 44, 0x 55 66 77 88, 0x 05 01 04 00) ->
-                    t_s4_0 = __builtin_amdgcn_perm(x_s4_1, x_s4_0, m0);
+                        // 0x33774488
-                    t_s4_1 = __builtin_amdgcn_perm(x_s4_3, x_s4_2, m0);
+                        //                   -- -- -- --     -- -- -- --      -  -  -  -
-                    y_s4_0 = __builtin_amdgcn_perm(t_s4_1, t_s4_0, m1);
+                        //             index  7  6  5  4      3  2  1  0     33 77 44 88
-                    y_s4_1 = __builtin_amdgcn_perm(t_s4_1, t_s4_0, m2);
+                        // index is reversed because of little endianness (least significant bits
-                    t_s4_0 = __builtin_amdgcn_perm(x_s4_1, x_s4_0, m3);
+                        // first)
-                    t_s4_1 = __builtin_amdgcn_perm(x_s4_3, x_s4_2, m3);
+                        t_s4_0 = __builtin_amdgcn_perm(x_s4_1, x_s4_0, m0);
-                    y_s4_2 = __builtin_amdgcn_perm(t_s4_1, t_s4_0, m1);
+                        t_s4_1 = __builtin_amdgcn_perm(x_s4_3, x_s4_2, m0);
-                    y_s4_3 = __builtin_amdgcn_perm(t_s4_1, t_s4_0, m2);
+                        y_s4_0 = __builtin_amdgcn_perm(t_s4_1, t_s4_0, m1);
+                        y_s4_1 = __builtin_amdgcn_perm(t_s4_1, t_s4_0, m2);
-                    // 4 int8x4 data from vy_tuple
+                        t_s4_0 = __builtin_amdgcn_perm(x_s4_1, x_s4_0, m3);
-                    vy_tuple(iy).template get_as<S4>()(ix / I4)      = bit_cast<S4>(y_s4_0);
+                        t_s4_1 = __builtin_amdgcn_perm(x_s4_3, x_s4_2, m3);
-                    vy_tuple(iy + I1).template get_as<S4>()(ix / I4) = bit_cast<S4>(y_s4_1);
+                        y_s4_2 = __builtin_amdgcn_perm(t_s4_1, t_s4_0, m1);
-                    vy_tuple(iy + I2).template get_as<S4>()(ix / I4) = bit_cast<S4>(y_s4_2);
+                        y_s4_3 = __builtin_amdgcn_perm(t_s4_1, t_s4_0, m2);
-                    vy_tuple(iy + I3).template get_as<S4>()(ix / I4) = bit_cast<S4>(y_s4_3);
+                        // 4 int8x4 data from vy_tuple
+                        vy_tuple(iy).template get_as<S4>()(ix / I4)      = bit_cast<S4>(y_s4_0);
+                        vy_tuple(iy + I1).template get_as<S4>()(ix / I4) = bit_cast<S4>(y_s4_1);
+                        vy_tuple(iy + I2).template get_as<S4>()(ix / I4) = bit_cast<S4>(y_s4_2);
+                        vy_tuple(iy + I3).template get_as<S4>()(ix / I4) = bit_cast<S4>(y_s4_3);
+                    });
                });
-            });
+            }
+            else if constexpr(NX % 2 == 0 && NY % 2 == 0)
+            {
+                static_for<0, NY, 2>{}([&](auto ix) {
+                    static_for<0, NX, 2>{}([&](auto iy) {
+                        const int16_t x_s2_0 =
+                            bit_cast<int16_t>(vx_tuple[ix].template get_as<S2>()[iy / I2]);
+                        const int16_t x_s2_1 =
+                            bit_cast<int16_t>(vx_tuple[ix + I1].template get_as<S2>()[iy / I2]);
+                        constexpr int32_t m0 = 0x05040100;
+                        constexpr int32_t m1 = 0x07060302;
+                        const int32_t x0_32 = static_cast<int32_t>(x_s2_0 & 0xFFFF);
+                        const int32_t x1_32 = static_cast<int32_t>(x_s2_1 & 0xFFFF);
+                        const int32_t y_s2_0 = __builtin_amdgcn_perm(x1_32, x0_32, m0);
+                        const int32_t y_s2_1 = __builtin_amdgcn_perm(x1_32, x0_32, m1);
+                        vy_tuple(iy).template get_as<S2>()[ix / I2] =
+                            bit_cast<S2>(static_cast<int16_t>(y_s2_0 & 0xFFFF));
+                        vy_tuple(iy + I1).template get_as<S2>()[ix / I2] =
+                            bit_cast<S2>(static_cast<int16_t>(y_s2_1 & 0xFFFF));
+                    });
+                });
+            }
        }
        else
        {

--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs.hpp
@@ -343,6 +343,8 @@ struct BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVS
        // moving k_dram_window is an in-page-block operation, so there is
        // no need to invoke k_page_block_navigator.move_tile_window() here.
        move_tile_window(k_dram_window, {0, kK0});
+        // ensure LDS access by Q is done before the over-writting by K
+        block_sync_lds();
        store_tile(k_lds_window, tile_elementwise_in(k_element_func, k_block_tile));
        do

--- a/include/ck_tile/ops/gemm.hpp
+++ b/include/ck_tile/ops/gemm.hpp
@@ -29,6 +29,8 @@
 #include "ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp"
+#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp"
+#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4_default_policy.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp"

--- a/include/ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v1.hpp
+++ b/include/ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v1.hpp
@@ -14,24 +14,54 @@ namespace ck_tile {
 template <typename Problem_, typename Policy_ = BlockGemmARegBRegCRegV1DefaultPolicy>
 struct BlockGemmARegBRegCRegV1
 {
-    using Problem        = remove_cvref_t<Problem_>;
+    private:
-    using Policy         = remove_cvref_t<Policy_>;
+    template <typename PipelineProblem_, typename GemmPolicy_>
-    using ADataType      = remove_cvref_t<typename Problem::ADataType>;
+    struct GemmTraits_
-    using BDataType      = remove_cvref_t<typename Problem::BDataType>;
+    {
-    using CDataType      = remove_cvref_t<typename Problem::CDataType>;
+        using Problem        = remove_cvref_t<PipelineProblem_>;
-    using BlockGemmShape = remove_cvref_t<typename Problem::BlockGemmShape>;
+        using Policy         = remove_cvref_t<GemmPolicy_>;
+        using ADataType      = remove_cvref_t<typename Problem::ADataType>;
-    static constexpr index_t kBlockSize   = Problem::kBlockSize;
+        using BDataType      = remove_cvref_t<typename Problem::BDataType>;
-    static constexpr index_t MPerBlock    = BlockGemmShape::kM;
+        using CDataType      = remove_cvref_t<typename Problem::CDataType>;
-    static constexpr index_t NPerBlock    = BlockGemmShape::kN;
+        using BlockGemmShape = remove_cvref_t<typename Problem::BlockGemmShape>;
-    static constexpr index_t KPerBlock    = BlockGemmShape::kK;
-    static constexpr auto config          = Policy::template GetWarpGemmMWarpNWarp<Problem>();
+        static constexpr index_t kBlockSize = Problem::kBlockSize;
-    using WG                              = remove_cvref_t<decltype(config.template at<0>())>;
-    static constexpr index_t MWarp        = config.template at<1>();
+        static constexpr index_t MPerBlock = BlockGemmShape::kM;
-    static constexpr index_t NWarp        = config.template at<2>();
+        static constexpr index_t NPerBlock = BlockGemmShape::kN;
-    static constexpr index_t MIterPerWarp = MPerBlock / (MWarp * WG::kM);
+        static constexpr index_t KPerBlock = BlockGemmShape::kK;
-    static constexpr index_t NIterPerWarp = NPerBlock / (NWarp * WG::kN);
-    static constexpr index_t KIterPerWarp = KPerBlock / WG::kK;
+        static constexpr auto config = Policy::template GetWarpGemmMWarpNWarp<Problem>();
+        using WarpGemm               = remove_cvref_t<decltype(config.template at<0>())>;
+        static constexpr index_t MWarp        = config.template at<1>();
+        static constexpr index_t NWarp        = config.template at<2>();
+        static constexpr index_t MIterPerWarp = MPerBlock / (MWarp * WarpGemm::kM);
+        static constexpr index_t NIterPerWarp = NPerBlock / (NWarp * WarpGemm::kN);
+        static constexpr index_t KIterPerWarp = KPerBlock / WarpGemm::kK;
+        static constexpr index_t KPack = WarpGemm::kKPerThread;
+    };
+    public:
+    using Problem = remove_cvref_t<Problem_>;
+    using Policy  = remove_cvref_t<Policy_>;
+    using Traits = GemmTraits_<Problem, Policy>;
+    using WarpGemm       = typename Traits::WarpGemm;
+    using BlockGemmShape = typename Traits::BlockGemmShape;
+    using ADataType = remove_cvref_t<typename Traits::ADataType>;
+    using BDataType = remove_cvref_t<typename Traits::BDataType>;
+    using CDataType = remove_cvref_t<typename Traits::CDataType>;
+    static constexpr index_t KIterPerWarp = Traits::KIterPerWarp;
+    static constexpr index_t MIterPerWarp = Traits::MIterPerWarp;
+    static constexpr index_t NIterPerWarp = Traits::NIterPerWarp;
+    static constexpr index_t MWarp = Traits::MWarp;
+    static constexpr index_t NWarp = Traits::NWarp;
    CK_TILE_DEVICE static constexpr auto MakeABlockDistributionEncode()
    {
@@ -43,7 +73,7 @@ struct BlockGemmARegBRegCRegV1
                                       sequence<1, 2>,
                                       sequence<0, 0>>{};
        constexpr auto a_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
-            a_block_outer_dstr_encoding, typename WG::AWarpDstrEncoding{});
+            a_block_outer_dstr_encoding, typename WarpGemm::AWarpDstrEncoding{});
        return a_block_dstr_encode;
    }
@@ -58,7 +88,7 @@ struct BlockGemmARegBRegCRegV1
                                       sequence<1, 2>,
                                       sequence<0, 0>>{};
        constexpr auto b_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
-            b_block_outer_dstr_encoding, typename WG::BWarpDstrEncoding{});
+            b_block_outer_dstr_encoding, typename WarpGemm::BWarpDstrEncoding{});
        return b_block_dstr_encode;
    }
@@ -73,7 +103,7 @@ struct BlockGemmARegBRegCRegV1
            sequence<1, 2>,
            sequence<0, 0>>{};
        constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
-            c_block_outer_dstr_encoding, typename WG::CWarpDstrEncoding{});
+            c_block_outer_dstr_encoding, typename WarpGemm::CWarpDstrEncoding{});
        return c_block_dstr_encode;
    }
@@ -112,13 +142,13 @@ struct BlockGemmARegBRegCRegV1
                                                       .get_static_tile_distribution_encoding())>>,
            "C distribution is wrong!");
-        using AWarpDstr = typename WG::AWarpDstr;
+        using AWarpDstr = typename WarpGemm::AWarpDstr;
-        using BWarpDstr = typename WG::BWarpDstr;
+        using BWarpDstr = typename WarpGemm::BWarpDstr;
-        using CWarpDstr = typename WG::CWarpDstr;
+        using CWarpDstr = typename WarpGemm::CWarpDstr;
-        using AWarpTensor = typename WG::AWarpTensor;
+        using AWarpTensor = typename WarpGemm::AWarpTensor;
-        using BWarpTensor = typename WG::BWarpTensor;
+        using BWarpTensor = typename WarpGemm::BWarpTensor;
-        using CWarpTensor = typename WG::CWarpTensor;
+        using CWarpTensor = typename WarpGemm::CWarpTensor;
        constexpr auto a_warp_y_lengths =
            to_sequence(AWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
@@ -157,7 +187,7 @@ struct BlockGemmARegBRegCRegV1
                        merge_sequences(sequence<1, 1>{}, c_warp_y_lengths));
                    // warp GEMM
-                    WG{}(c_warp_tensor, a_warp_tensor, b_warp_tensor);
+                    WarpGemm{}(c_warp_tensor, a_warp_tensor, b_warp_tensor);
                    // write C warp tensor into C block tensor
                    c_block_tensor.set_y_sliced_thread_data(
@@ -180,7 +210,7 @@ struct BlockGemmARegBRegCRegV1
            sequence<0, 0>>{};
        constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
-            c_block_outer_dstr_encoding, typename WG::CWarpDstrEncoding{});
+            c_block_outer_dstr_encoding, typename WarpGemm::CWarpDstrEncoding{});
        constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode);
        auto c_block_tensor         = make_static_distributed_tensor<CDataType>(c_block_dstr);
        return c_block_tensor;

--- a/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp
+++ b/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp
@@ -463,7 +463,9 @@ struct GemmKernel
     * @param a_ptr input A pointer
     * @param b_ptr input B pointer
     * @param c_ptr output C pointer
+     * @param smem_ptr_0 The start memory pointer of the shared memory block.
     * @param kargs GEMM kernel arguments
+     * @param splitk_batch_offset splitk_batch_offset Utility structure used to calculate k batch.
     * @param block_idx_m The GEMM's output M dimension tile index processed by this workgroup.
     * @param block_idx_n The GEMM's output N dimension tile index processed by this workgroup.
     *
@@ -473,7 +475,7 @@ struct GemmKernel
    CK_TILE_DEVICE static void RunGemm(const ADataType* a_ptr,
                                       const BDataType* b_ptr,
                                       CDataType* c_ptr,
-                                       void* smem_ptr,
+                                       void* smem_ptr_0,
                                       const GemmKernelArgs& kargs,
                                       const SplitKBatchOffset& splitk_batch_offset,
                                       const index_t block_idx_m,
@@ -491,15 +493,67 @@ struct GemmKernel
        // Run GEMM cooperatively by whole workgroup.
        const auto& a_block_window = gemm_tile_windows.at(I0);
        const auto& b_block_window = gemm_tile_windows.at(I1);
-        const auto& c_block_tile =
-            GemmPipeline{}.template operator()(a_block_window, b_block_window, num_loop, smem_ptr);
+        const auto& c_block_tile = GemmPipeline{}.template operator()(
+            a_block_window, b_block_window, num_loop, smem_ptr_0);
+        // Run Epilogue Pipeline
+        auto& c_block_window = gemm_tile_windows.at(I2);
+        EpiloguePipeline{}
+            .template operator()<decltype(c_block_window), decltype(c_block_tile), DstInMemOp>(
+                c_block_window, c_block_tile, smem_ptr_0);
+    }
+    /**
+     * @brief Runs single GEMM problem cooperatively by whole workgroup.
+     *
+     * @note RunGEMM2LDS in with two shared memory buffers using the ping pong buffer mechanism.
+     *
+     * @param a_ptr input A pointer
+     * @param b_ptr input B pointer
+     * @param c_ptr output C pointer
+     * @param smem_ptr_0 The starting pointer of 1st shared memory block.
+     * @param smem_ptr_1 The starting pointer of 2nd shared memory block.
+     * @param kargs GEMM kernel arguments
+     * @param splitk_batch_offset Utility structure used to calculate k batch.
+     * @param block_idx_m The GEMM's output M dimension tile index processed by this workgroup.
+     * @param block_idx_n The GEMM's output N dimension tile index processed by this workgroup.
+     *
+     * @tparam DstInMemOp Destination memory operation (default: set).
+     */
+    template <memory_operation_enum DstInMemOp = memory_operation_enum::set>
+    CK_TILE_DEVICE static void RunGemm2LDS(const ADataType* a_ptr,
+                                           const BDataType* b_ptr,
+                                           CDataType* c_ptr,
+                                           void* __restrict__ smem_ptr_0,
+                                           void* __restrict__ smem_ptr_1,
+                                           const GemmKernelArgs& kargs,
+                                           const SplitKBatchOffset& splitk_batch_offset,
+                                           const index_t block_idx_m,
+                                           const index_t block_idx_n)
+    {
+        // Create Gemm tensor views, pad views and tile windows
+        const auto& gemm_tensor_views_tuple =
+            MakeGemmTensorViews<DstInMemOp>(a_ptr, b_ptr, c_ptr, kargs, splitk_batch_offset);
+        const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple);
+        auto gemm_tile_windows     = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);
+        const index_t num_loop = TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k);
+        // Run GEMM cooperatively by whole workgroup.
+        const auto& a_block_window = gemm_tile_windows.at(I0);
+        const auto& b_block_window = gemm_tile_windows.at(I1);
+        const auto& c_block_tile = GemmPipeline{}.template operator()(
+            a_block_window, b_block_window, num_loop, smem_ptr_0, smem_ptr_1);
        // Run Epilogue Pipeline
        auto& c_block_window = gemm_tile_windows.at(I2);
        EpiloguePipeline{}
            .template operator()<decltype(c_block_window), decltype(c_block_tile), DstInMemOp>(
-                c_block_window, c_block_tile, smem_ptr);
+                c_block_window, c_block_tile, smem_ptr_0);
    }
    CK_TILE_DEVICE void operator()(GemmKernelArgs kargs) const
@@ -517,11 +571,27 @@ struct GemmKernel
        CDataType* c_ptr = static_cast<CDataType*>(kargs.c_ptr);
        // allocate LDS
-        __shared__ char smem_ptr[GetSmemSize()];
+        __shared__ char smem_ptr_0[GetSmemSize()];
+        __shared__ char smem_ptr_1[GetSmemSize()];
        if(kargs.k_batch == 1)
        {
-            RunGemm(a_ptr, b_ptr, c_ptr, smem_ptr, kargs, splitk_batch_offset, i_m, i_n);
+            if constexpr(GemmPipeline::DoubleSmemBuffer == true)
+            {
+                RunGemm2LDS(a_ptr,
+                            b_ptr,
+                            c_ptr,
+                            smem_ptr_0,
+                            smem_ptr_1,
+                            kargs,
+                            splitk_batch_offset,
+                            i_m,
+                            i_n);
+            }
+            else
+            {
+                RunGemm(a_ptr, b_ptr, c_ptr, smem_ptr_0, kargs, splitk_batch_offset, i_m, i_n);
+            }
        }
        else
        {
@@ -530,8 +600,23 @@ struct GemmKernel
            if constexpr(!(EpiloguePipeline::template GetVectorSizeC<CDataType>() % 2 != 0 &&
                           is_any_of<CDataType, fp16_t, bf16_t>::value))
            {
-                RunGemm<memory_operation_enum::atomic_add>(
+                if constexpr(GemmPipeline::DoubleSmemBuffer == true)
-                    a_ptr, b_ptr, c_ptr, smem_ptr, kargs, splitk_batch_offset, i_m, i_n);
+                {
+                    RunGemm2LDS<memory_operation_enum::atomic_add>(a_ptr,
+                                                                   b_ptr,
+                                                                   c_ptr,
+                                                                   smem_ptr_0,
+                                                                   smem_ptr_1,
+                                                                   kargs,
+                                                                   splitk_batch_offset,
+                                                                   i_m,
+                                                                   i_n);
+                }
+                else
+                {
+                    RunGemm<memory_operation_enum::atomic_add>(
+                        a_ptr, b_ptr, c_ptr, smem_ptr_0, kargs, splitk_batch_offset, i_m, i_n);
+                }
            }
        }
    }

--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp
@@ -41,20 +41,26 @@ struct GemmPipelineAgBgCrImplBase
        store_tile(lds_tile_window, block_tile_tmp);
    }
+    template <typename DstBlockTile, typename SrcTileWindow>
+    CK_TILE_DEVICE void LocalPrefetch(DstBlockTile& dst_block_tile,
+                                      const SrcTileWindow& lds_tile_window) const
+    {
+        load_tile(dst_block_tile, lds_tile_window);
+    }
    CK_TILE_DEVICE auto GetABLdsTensorViews(void* p_smem) const
    {
        // A tile in LDS
-        ADataType* p_a_lds              = static_cast<ADataType*>(p_smem);
+        ADataType* __restrict__ p_a_lds = static_cast<ADataType*>(p_smem);
        constexpr auto a_lds_block_desc = Policy::template MakeALdsBlockDescriptor<Problem>();
        auto a_lds_block = make_tensor_view<address_space_enum::lds>(p_a_lds, a_lds_block_desc);
        // TODO: LDS alignment should come from Policy!
-        constexpr index_t a_lds_block_space_size_aligned =
+        constexpr index_t a_lds_block_space_size_aligned = integer_least_multiple(
-            integer_divide_ceil(sizeof(ADataType) * a_lds_block_desc.get_element_space_size(), 16) *
+            sizeof(ADataType) * a_lds_block_desc.get_element_space_size(), 16);
-            16;
        // B tile in LDS
-        BDataType* p_b_lds = static_cast<BDataType*>(
+        BDataType* __restrict__ p_b_lds = static_cast<BDataType*>(
            static_cast<void*>(static_cast<char*>(p_smem) + a_lds_block_space_size_aligned));
        constexpr auto b_lds_block_desc = Policy::template MakeBLdsBlockDescriptor<Problem>();
        auto b_lds_block = make_tensor_view<address_space_enum::lds>(p_b_lds, b_lds_block_desc);

--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp
@@ -76,6 +76,8 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Problem>
    static constexpr bool kPadN = Problem::kPadN;
    static constexpr bool kPadK = Problem::kPadK;
+    static constexpr bool DoubleSmemBuffer = Problem::DoubleSmemBuffer;
    static constexpr bool HasHotLoop = Problem::HasHotLoop;
    static constexpr auto TailNum    = Problem::TailNum;
    static constexpr auto Scheduler  = Problem::Scheduler;

--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4_default_policy.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4_default_policy.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp"
+#include "ck_tile/ops/common/tensor_layout.hpp"
+#include "ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp"
+namespace ck_tile {
+// Default policy for GemmPipelineAGmemBGmemCregComputeV4, except the block gemm method, it shares
+// the same vector size implementation, SmemSize, Global memory tile distiribution as the
+// UniversalGemm Pipeline Policy.
+// Default policy class should not be templated, put template on
+// member functions instead.
+struct GemmPipelineAgBgCrCompV4DefaultPolicy
+    : public UniversalGemmBasePolicy<GemmPipelineAgBgCrCompV4DefaultPolicy>
+{
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeALdsBlockDescriptor()
+    {
+        using namespace ck_tile;
+        constexpr index_t kMPerBlock = Problem::BlockGemmShape::kM;
+        constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK;
+        constexpr index_t KPack      = GetSmemPackA<Problem>();
+        constexpr auto a_lds_block_desc_0 = make_naive_tensor_descriptor(
+            make_tuple(number<kKPerBlock / KPack>{}, number<kMPerBlock>{}, number<KPack>{}),
+            make_tuple(number<kMPerBlock * KPack>{}, number<KPack>{}, number<1>{}),
+            number<KPack>{},
+            number<1>{});
+        constexpr auto a_lds_block_desc = transform_tensor_descriptor(
+            a_lds_block_desc_0,
+            make_tuple(
+                make_pass_through_transform(number<kMPerBlock>{}),
+                make_merge_transform(make_tuple(number<kKPerBlock>{} / KPack, number<KPack>{}))),
+            make_tuple(sequence<1>{}, sequence<0, 2>{}),
+            make_tuple(sequence<0>{}, sequence<1>{}));
+        return a_lds_block_desc;
+    }
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeBLdsBlockDescriptor()
+    {
+        constexpr index_t kNPerBlock = Problem::BlockGemmShape::kN;
+        constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK;
+        constexpr index_t KPack      = GetSmemPackB<Problem>();
+        constexpr auto b_lds_block_desc_0 = make_naive_tensor_descriptor(
+            make_tuple(number<kKPerBlock / KPack>{}, number<kNPerBlock>{}, number<KPack>{}),
+            make_tuple(number<(kNPerBlock)*KPack>{}, number<KPack>{}, number<1>{}),
+            number<KPack>{},
+            number<1>{});
+        constexpr auto b_lds_block_desc = transform_tensor_descriptor(
+            b_lds_block_desc_0,
+            make_tuple(
+                make_pass_through_transform(number<kNPerBlock>{}),
+                make_merge_transform(make_tuple(number<kKPerBlock / KPack>{}, number<KPack>{}))),
+            make_tuple(sequence<1>{}, sequence<0, 2>{}),
+            make_tuple(sequence<0>{}, sequence<1>{}));
+        return b_lds_block_desc;
+    }
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetBlockGemm()
+    {
+        using AccDataType     = float;
+        using BlockWarps      = typename Problem::BlockGemmShape::BlockWarps;
+        using WarpTile        = typename Problem::BlockGemmShape::WarpTile;
+        using WarpGemm        = WarpGemmMfmaDispatcher<typename Problem::ADataType,
+                                                typename Problem::BDataType,
+                                                AccDataType,
+                                                WarpTile::at(I0),
+                                                WarpTile::at(I1),
+                                                WarpTile::at(I2),
+                                                Problem::TransposeC>;
+        using BlockGemmPolicy = BlockGemmARegBRegCRegV1CustomPolicy<typename Problem::ADataType,
+                                                                    typename Problem::BDataType,
+                                                                    typename Problem::CDataType,
+                                                                    BlockWarps,
+                                                                    WarpGemm>;
+        return BlockGemmARegBRegCRegV1<Problem, BlockGemmPolicy>{};
+    }
+};
+} // namespace ck_tile
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp
@@ -124,6 +124,8 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
    static constexpr bool kPadN = Problem::kPadN;
    static constexpr bool kPadK = Problem::kPadK;
+    static constexpr bool DoubleSmemBuffer = Problem::DoubleSmemBuffer;
    // Where is the right place for HasHotLoop and TailNum ???
    static constexpr bool HasHotLoop = Problem::HasHotLoop;
    static constexpr auto TailNum    = Problem::TailNum;