Merge branch 'develop' into muozturk_bf16fp8_streamk

27fb084f · Muhammed Emin Ozturk · GitHub · 0b5ad335 · c0adab48 · 27fb084f
Unverified Commit 27fb084f authored Feb 11, 2025 by Muhammed Emin Ozturk Committed by GitHub Feb 11, 2025
20 changed files
--- a/example/ck_tile/03_gemm/gemm_basic.cpp
+++ b/example/ck_tile/03_gemm/gemm_basic.cpp
@@ -82,8 +82,11 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config&
    if(s.log_level_ > 0)
    {
-        std::cout << "Launching kernel with args:"
+        std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
-                  << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                  << "shape: " << CodegenGemmShape::GetName() << '\n'
+                  << "problem: " << CodegenPipelineProblem::GetName() << '\n'
+                  << "pipeline: " << CodegenGemmPipeline::GetName() << '\n'
+                  << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
                  << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
                  << std::endl;
    }

--- a/example/ck_tile/03_gemm/gemm_basic.hpp
+++ b/example/ck_tile/03_gemm/gemm_basic.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once

--- a/example/ck_tile/03_gemm/run_gemm_example.inc
+++ b/example/ck_tile/03_gemm/run_gemm_example.inc
@@ -173,7 +173,7 @@ int run_gemm_example_with_layouts(int argc,
        std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
                  << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
                  << std::endl;
-        std::cout << "The CPU veification result is:" << (pass ? "correct" : "fail") << std::endl;
+        std::cout << "The CPU verification result is:" << (pass ? "correct" : "fail") << std::endl;
    }
    else if(arg_parser.get_int("v") == 2)
    {
@@ -231,7 +231,7 @@ int run_gemm_example_with_layouts(int argc,
        std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
                  << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
                  << std::endl;
-        std::cout << "The GPU veification result is: " << (pass ? "correct" : "fail") << std::endl;
+        std::cout << "The GPU verification result is: " << (pass ? "correct" : "fail") << std::endl;
    }
    return pass;

--- a/example/ck_tile/13_moe_sorting/moe_sorting.cpp
+++ b/example/ck_tile/13_moe_sorting/moe_sorting.cpp
@@ -26,6 +26,10 @@ auto create_args(int argc, char* argv[])
        .insert("k", "4", "topk")
        .insert("unit", "32", "unit_size")
        .insert("moe_buf_size", "0", "moe_buf_size")
+        .insert("local_eid",
+                "-1",
+                "a list of experts enabled as local expert. e.g. \"0,1,4,5\"\n"
+                "please make sure eid is in ascending order!")
        .insert("seed", "-1", "seed to be used, -1 means random every time")
        .insert("kname", "0", "when set to 1 it will print kernel name")
        .insert("warmup", "5", "number of iterations before benchmark the kernel")
@@ -74,6 +78,7 @@ bool test_moe_sorting(ck_tile::ArgParser args)
    int kname               = args.get_int("kname");
    int warmup              = args.get_int("warmup");
    int repeat              = args.get_int("repeat");
    int max_output_ids =
        ck_tile::integer_least_multiple(topk * tokens + num_experts * unit_size - topk, unit_size);
@@ -90,6 +95,30 @@ bool test_moe_sorting(ck_tile::ArgParser args)
        return false;
    }
+    bool local_expert_masking      = args.get_str("local_eid") != "-1";
+    auto local_expert_masking_host = [&]() {
+        if(local_expert_masking)
+        {
+            auto local_eid = args.get_int_vec("local_eid");
+            // std::vector<int> v_ {num_experts, 0};
+            ck_tile::HostTensor<IndexType> v_{{num_experts}};
+            v_.SetZero();
+            for(auto eid : local_eid)
+            {
+                if(eid >= num_experts)
+                {
+                    throw std::runtime_error(
+                        "local_eid larger than number of expert, please check");
+                }
+                v_.mData[eid] = 1;
+            }
+            return v_;
+        }
+        else
+            // return std::vector<int>{};
+            return ck_tile::HostTensor<IndexType>{{1}};
+    }();
    // tokens already considered batch size
    ck_tile::HostTensor<IndexType> topk_ids_host({tokens, topk}, {topk, 1});
    ck_tile::HostTensor<WeightType> weights_host({tokens, topk}, {topk, 1});
@@ -111,6 +140,8 @@ bool test_moe_sorting(ck_tile::ArgParser args)
        sorted_expert_ids_host.get_element_space_size_in_bytes());
    ck_tile::DeviceMem sorted_id_cnt_dev(sorted_id_cnt_host.get_element_space_size_in_bytes());
    ck_tile::DeviceMem moe_buf_dev(moe_buf_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem local_expert_masking_dev(
+        local_expert_masking_host.get_element_space_size_in_bytes());
    topk_ids_dev.ToDevice(topk_ids_host.data());
    weights_dev.ToDevice(weights_host.data());
@@ -118,11 +149,15 @@ bool test_moe_sorting(ck_tile::ArgParser args)
    {
        moe_buf_dev.ToDevice(moe_buf_host.data());
    }
+    if(local_expert_masking)
+        local_expert_masking_dev.ToDevice(local_expert_masking_host.data());
-    moe_sorting_trait trait{index_prec, weight_prec};
+    moe_sorting_trait trait{index_prec, weight_prec, local_expert_masking};
    moe_sorting_args karg{topk_ids_dev.GetDeviceBuffer(),
                          weights_dev.GetDeviceBuffer(),
+                          local_expert_masking ? local_expert_masking_dev.GetDeviceBuffer()
+                                               : nullptr,
                          sorted_ids_dev.GetDeviceBuffer(),
                          sorted_weights_dev.GetDeviceBuffer(),
                          sorted_expert_ids_dev.GetDeviceBuffer(),
@@ -140,15 +175,22 @@ bool test_moe_sorting(ck_tile::ArgParser args)
                              warmup,
                              repeat};
    auto ms = moe_sorting(trait, karg, sc);
-    printf("[%s|%s]tokens:%d, num_experts:%d, topk:%d,  ms:%f , ",
+    printf("[%s|%s]tokens:%d, num_experts:%d, topk:%d, ",
           index_prec.c_str(),
           weight_prec.c_str(),
           tokens,
           num_experts,
-           topk,
+           topk);
-           ms);
+    if(local_expert_masking)
+    {
+        printf("local_eid:%s, ", args.get_str("local_eid").c_str());
+    }
    if(ms < 0)
        printf("not supported\n");
+    else
+        printf("ms:%f, ", ms);
    fflush(stdout);
    if(ms < 0)
    {
@@ -174,12 +216,14 @@ bool test_moe_sorting(ck_tile::ArgParser args)
        int32_t ref_total_tokens_post_pad = 0;
        ck_tile::reference_moe_sorting<WeightType, IndexType>(topk_ids_host,
                                                              weights_host,
+                                                              local_expert_masking_host,
                                                              sorted_ids_ref,
                                                              sorted_weights_ref,
                                                              sorted_expert_ids_ref,
                                                              ref_total_tokens_post_pad,
                                                              num_experts,
-                                                              unit_size);
+                                                              unit_size,
+                                                              local_expert_masking);
        rtn &= ck_tile::check_err(
            sorted_ids_host, sorted_ids_ref, std::string("OUT Error: Incorrect ids!"), 1e-6, 1e-6);
        rtn &= ck_tile::check_err(sorted_weights_host,
@@ -199,9 +243,16 @@ bool test_moe_sorting(ck_tile::ArgParser args)
                moe_buf_host, moe_buf_ref, std::string("OUT Error: Incorrect zero buf!"), 0, 0);
        }
        rtn &= ref_total_tokens_post_pad == sorted_id_cnt_host.mData[0];
+        printf("total_tokens_post_pad:%d(%d), ",
+               ref_total_tokens_post_pad,
+               sorted_id_cnt_host.mData[0]);
    }
-    printf("valid:%s\n", rtn ? "y" : "n");
+    printf("valid:%s", rtn ? "y" : "n");
+    fflush(stdout);
+    if(!rtn)
+        printf(", (%d)", seed);
+    printf("\n");
    fflush(stdout);
    return rtn;
 }

--- a/example/ck_tile/13_moe_sorting/moe_sorting_api.cpp
+++ b/example/ck_tile/13_moe_sorting/moe_sorting_api.cpp
@@ -3,6 +3,12 @@
 #include "moe_sorting_api.hpp"
+#ifndef MOE_SORTING_USE_EX_KERNEL
+#define MOE_SORTING_USE_EX_KERNEL 1
+#endif
+#if !MOE_SORTING_USE_EX_KERNEL
 #define MOE_SORTING_DISPATCH_ETILE(unroll_num_, expert_tile_)                         \
    constexpr ck_tile::index_t unroll_num  = unroll_num_;                             \
    constexpr ck_tile::index_t expert_tile = expert_tile_;                            \
@@ -17,6 +23,67 @@
        s, ck_tile::make_kernel(kernel{}, grids, blocks, lds_bytes, kargs));    \
    return ave_time;
+#else
+#define MOE_SORTING_DISPATCH_(sub_token_tile_, sub_token_onshot_, local_expert_masking_)                \
+    constexpr ck_tile::index_t sub_token_tile = sub_token_tile_;                                        \
+    constexpr bool sub_token_onshot           = sub_token_onshot_;                                      \
+    constexpr bool local_expert_masking       = local_expert_masking_;                                  \
+    using ms_problem                          = ck_tile::MoeSortingProblemEx<index_t,                   \
+                                                    ms_weight_type,            \
+                                                    sub_token_tile,            \
+                                                    sub_token_onshot,          \
+                                                    local_expert_masking>;     \
+    using kernel                              = ck_tile::MoeSortingKernel<ms_problem>;                  \
+    auto kargs                                = kernel::MakeKargs(a);                                   \
+    const dim3 grids                          = kernel::GridSize(a);                                    \
+    const dim3 blocks                         = kernel::BlockSize(a);                                   \
+    const auto lds_bytes                      = kernel::GetSmemSize(a);                                 \
+    float ave_time                            = ck_tile::launch_kernel(                                 \
+        s, ck_tile::make_kernel(kernel{}, grids, blocks, lds_bytes, kargs)); \
+    return ave_time;
+#define MOE_SORTING_DISPATCH_SUB_TOKEN_(row_, sub_token_onshot_, local_expert_masking_) \
+    if(row_ % 8 == 0)                                                                   \
+    {                                                                                   \
+        MOE_SORTING_DISPATCH_(8, sub_token_onshot_, local_expert_masking_);             \
+    }                                                                                   \
+    else if(row_ % 4 == 0)                                                              \
+    {                                                                                   \
+        MOE_SORTING_DISPATCH_(4, sub_token_onshot_, local_expert_masking_);             \
+    }                                                                                   \
+    else if(row_ % 2 == 0)                                                              \
+    {                                                                                   \
+        MOE_SORTING_DISPATCH_(2, sub_token_onshot_, local_expert_masking_);             \
+    }                                                                                   \
+    else                                                                                \
+    {                                                                                   \
+        MOE_SORTING_DISPATCH_(1, sub_token_onshot_, local_expert_masking_);             \
+    }
+#define MOE_SORTING_DISPATCH_SUBTO_(row_, local_expert_masking_)            \
+    if(is_sub_token_onshot)                                                 \
+    {                                                                       \
+        MOE_SORTING_DISPATCH_SUB_TOKEN_(row_, true, local_expert_masking_)  \
+    }                                                                       \
+    else                                                                    \
+    {                                                                       \
+        MOE_SORTING_DISPATCH_SUB_TOKEN_(row_, false, local_expert_masking_) \
+    }
+#define MOE_SORTING_DISPATCH_EMASK_(row_)        \
+    if(is_local_expert_masking)                  \
+    {                                            \
+        MOE_SORTING_DISPATCH_SUBTO_(row_, true)  \
+    }                                            \
+    else                                         \
+    {                                            \
+        MOE_SORTING_DISPATCH_SUBTO_(row_, false) \
+    }
+#endif
+#if !MOE_SORTING_USE_EX_KERNEL
 #define MOE_SORTING_DISPATCH(unroll_num_)           \
    if(a.num_experts <= 8)                          \
    {                                               \
@@ -38,11 +105,13 @@
    {                                               \
        MOE_SORTING_DISPATCH_ETILE(unroll_num_, 0)  \
    }
+#endif
 float moe_sorting(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_config s)
 {
    if(t.weight_type == "fp32" && t.index_type == "int32")
    {
+#if !MOE_SORTING_USE_EX_KERNEL
        if(a.num_experts > 127)
        {
            printf("lds size exceed, only support experts <127 \n");
@@ -83,6 +152,19 @@ float moe_sorting(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_confi
            MOE_SORTING_DISPATCH(4);
        }
        }
+#else
+        using index_t            = ck_tile::index_t;
+        using ms_weight_type     = float;
+        auto [r_, c_]            = ck_tile::moe_sorting_get_smem_row_col(a.tokens, a.num_experts);
+        auto sub_token_          = r_ - 2;
+        r_                       = (r_ - 2) / 8;
+        bool is_sub_token_onshot = a.tokens <= sub_token_;
+        bool is_local_expert_masking = t.local_expert_masking;
+        (void)c_;
+        MOE_SORTING_DISPATCH_EMASK_(r_);
+        // MOE_SORTING_DISPATCH_ETILE(0, 0);
+#endif
    }
    return -1;
 }
--- a/example/ck_tile/13_moe_sorting/moe_sorting_api.hpp
+++ b/example/ck_tile/13_moe_sorting/moe_sorting_api.hpp
@@ -11,6 +11,7 @@ struct moe_sorting_trait
 {
    std::string index_type;
    std::string weight_type;   // currently always float
+    bool local_expert_masking; // if mask experts as local expert
 };
 struct moe_sorting_args : public ck_tile::MoeSortingHostArgs

--- a/example/ck_tile/13_moe_sorting/script/smoke_test.sh
+++ b/example/ck_tile/13_moe_sorting/script/smoke_test.sh
@@ -17,4 +17,12 @@ $EXE -t=71 -e=11 -k=11
 $EXE -t=1 -e=1 -k=1
 $EXE -t=99 -e=2 -k=1
 $EXE -t=333 -e=99 -k=13
+$EXE -t=11 -e=256 -k=5
+$EXE -t=64 -e=455 -k=8
+$EXE -t=777 -e=802 -k=99
+$EXE -t=4097 -e=906 -k=51
 $EXE -t=128 -e=32 -k=5 -moe_buf_size=262144
+$EXE -t=13 -e=64 -k=3 -local_eid=4,5,6,7,8,9,10,11
+$EXE -t=99 -e=33 -k=9 -local_eid=6,10,11,15,19
+$EXE -t=80 -e=99 -k=10 -local_eid=0,8,12,33
+$EXE -t=11 -e=256 -k=5 -local_eid=99,110,129
--- a/example/ck_tile/15_fused_moe/README.md
+++ b/example/ck_tile/15_fused_moe/README.md
@@ -42,7 +42,7 @@ summary of the key design of this fused-moe operator:
 //  (only for reference)    exp-0  exp-1     exp-2   exp-3          exp-4  exp-5
 // weight_id_per_expert is: [[a], [g, j, m], [d, k], [b, e, h, l, n], [], [c, f, i, o]]
 //
-// max_num_tokens_padded : topk * input_tokens + num_experts * (M_a - 1)
+// max_num_tokens_padded : topk * input_tokens + num_experts * M_a - topk (updated)
 // * this could be larger than actual, since actual tokens are on GPU
 //
 // sorted_token_ids_ptr   : [0, 6, 6, 6, 2, 3, 4, 6, 1, 3, 6, 6, 0, 1, 2, 3, 4, 6, 6, 6, 6, 6, 6, 6, 0, 1, 2, 5]

--- a/example/ck_tile/15_fused_moe/instances/fused_moesorting_api.cpp
+++ b/example/ck_tile/15_fused_moe/instances/fused_moesorting_api.cpp
@@ -3,6 +3,12 @@
 #include "fused_moesorting.hpp"
+#ifndef MOE_SORTING_USE_EX_KERNEL
+#define MOE_SORTING_USE_EX_KERNEL 1
+#endif
+#if !MOE_SORTING_USE_EX_KERNEL
 #define MOE_SORTING_DISPATCH_ETILE(unroll_num_, expert_tile_)                         \
    constexpr ck_tile::index_t unroll_num  = unroll_num_;                             \
    constexpr ck_tile::index_t expert_tile = expert_tile_;                            \
@@ -17,6 +23,24 @@
        s, ck_tile::make_kernel(kernel{}, grids, blocks, lds_bytes, kargs));    \
    return ave_time;
+#else
+#define MOE_SORTING_DISPATCH_(sub_token_tile_, sub_token_onshot_)                                \
+    constexpr ck_tile::index_t sub_token_tile = sub_token_tile_;                                 \
+    constexpr bool sub_token_onshot           = sub_token_onshot_;                               \
+    using ms_problem =                                                                           \
+        ck_tile::MoeSortingProblemEx<index_t, ms_weight_type, sub_token_tile, sub_token_onshot>; \
+    using kernel         = ck_tile::MoeSortingKernel<ms_problem>;                                \
+    auto kargs           = kernel::MakeKargs(a);                                                 \
+    const dim3 grids     = kernel::GridSize(a);                                                  \
+    const dim3 blocks    = kernel::BlockSize(a);                                                 \
+    const auto lds_bytes = kernel::GetSmemSize(a);                                               \
+    float ave_time       = ck_tile::launch_kernel(                                               \
+        s, ck_tile::make_kernel(kernel{}, grids, blocks, lds_bytes, kargs));               \
+    return ave_time;
+#endif
+#if !MOE_SORTING_USE_EX_KERNEL
 #define MOE_SORTING_DISPATCH(unroll_num_)           \
    if(a.num_experts <= 8)                          \
    {                                               \
@@ -38,11 +62,13 @@
    {                                               \
        MOE_SORTING_DISPATCH_ETILE(unroll_num_, 0)  \
    }
+#endif
 float fused_moesorting(fused_moesorting_trait t, fused_moesorting_args a, ck_tile::stream_config s)
 {
    if(t.weight_type == "fp32" && t.index_type == "int32")
    {
+#if !MOE_SORTING_USE_EX_KERNEL
        if(a.num_experts > 127)
        {
            printf("lds size exceed, only support experts <127 \n");
@@ -83,6 +109,54 @@ float fused_moesorting(fused_moesorting_trait t, fused_moesorting_args a, ck_til
            MOE_SORTING_DISPATCH(4);
        }
        }
+#else
+        using index_t            = ck_tile::index_t;
+        using ms_weight_type     = float;
+        auto [r_, c_]            = ck_tile::moe_sorting_get_smem_row_col(a.tokens, a.num_experts);
+        auto sub_token_          = r_ - 2;
+        r_                       = (r_ - 2) / 8;
+        bool is_sub_token_onshot = a.tokens <= sub_token_;
+        (void)c_;
+        if(is_sub_token_onshot)
+        {
+            if(r_ % 8 == 0)
+            {
+                MOE_SORTING_DISPATCH_(8, true);
+            }
+            else if(r_ % 4 == 0)
+            {
+                MOE_SORTING_DISPATCH_(4, true);
+            }
+            else if(r_ % 2 == 0)
+            {
+                MOE_SORTING_DISPATCH_(2, true);
+            }
+            else
+            {
+                MOE_SORTING_DISPATCH_(1, true);
+            }
+        }
+        else
+        {
+            if(r_ % 8 == 0)
+            {
+                MOE_SORTING_DISPATCH_(8, false);
+            }
+            else if(r_ % 4 == 0)
+            {
+                MOE_SORTING_DISPATCH_(4, false);
+            }
+            else if(r_ % 2 == 0)
+            {
+                MOE_SORTING_DISPATCH_(2, false);
+            }
+            else
+            {
+                MOE_SORTING_DISPATCH_(1, false);
+            }
+        }
+        // MOE_SORTING_DISPATCH_ETILE(0, 0);
+#endif
    }
    return -1;
 }
--- a/example/ck_tile/16_batched_gemm/batched_gemm.cpp
+++ b/example/ck_tile/16_batched_gemm/batched_gemm.cpp
@@ -79,8 +79,11 @@ float batched_gemm(const ck_tile::BatchedGemmHostArgs& args, const ck_tile::stre
    if(s.log_level_ > 0)
    {
-        std::cout << "Launching kernel with args:"
+        std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
-                  << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                  << "shape: " << CodegenGemmShape::GetName() << '\n'
+                  << "problem: " << CodegenPipelineProblem::GetName() << '\n'
+                  << "pipeline: " << CodegenGemmPipeline::GetName() << '\n'
+                  << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
                  << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
                  << std::endl;
    }

--- a/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc
+++ b/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc
@@ -212,7 +212,7 @@ int run_batched_gemm_example_with_layouts(int argc,
                  << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
                  << std::endl;
-        std::cout << "The CPU veification result is:" << (pass ? "correct" : "fail") << std::endl;
+        std::cout << "The CPU verification result is:" << (pass ? "correct" : "fail") << std::endl;
    }
    else if(arg_parser.get_int("v") == 2)
    {

--- a/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp
+++ b/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp
@@ -118,7 +118,7 @@ float grouped_gemm(const std::vector<grouped_gemm_kargs>& gemm_descs,
    if(s.log_level_ > 0)
    {
-        std::cout << "Launching kernel with args:"
+        std::cout << "Launching kernel: " << GroupedGemmKernel::GetName() << " with args:"
                  << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
                  << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
                  << std::endl;

--- a/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc
+++ b/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc
@@ -202,7 +202,7 @@ int run_grouped_gemm_example_with_layouts(int argc,
                      << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
                      << std::endl;
        }
-        std::cout << "The CPU veification result is:" << (pass ? "correct" : "fail") << std::endl;
+        std::cout << "The CPU verification result is:" << (pass ? "correct" : "fail") << std::endl;
    }
    return pass;

--- a/include/ck_tile/core.hpp
+++ b/include/ck_tile/core.hpp
@@ -27,12 +27,12 @@
 #include "ck_tile/core/numeric/float8.hpp"
 #include "ck_tile/core/numeric/half.hpp"
 #include "ck_tile/core/numeric/int8.hpp"
-#include "ck_tile/core/numeric/pk_int4.hpp"
 #include "ck_tile/core/numeric/integer.hpp"
 #include "ck_tile/core/numeric/integral_constant.hpp"
 #include "ck_tile/core/numeric/math.hpp"
 #include "ck_tile/core/numeric/null_type.hpp"
 #include "ck_tile/core/numeric/numeric.hpp"
+#include "ck_tile/core/numeric/pk_int4.hpp"
 #include "ck_tile/core/numeric/type_convert.hpp"
 #include "ck_tile/core/numeric/vector_type.hpp"
 #include "ck_tile/core/tensor/buffer_view.hpp"

--- a/include/ck_tile/host.hpp
+++ b/include/ck_tile/host.hpp
@@ -5,6 +5,7 @@
 #include "ck_tile/host/arg_parser.hpp"
 #include "ck_tile/host/check_err.hpp"
+#include "ck_tile/host/concat.hpp"
 #include "ck_tile/host/convolution_host_tensor_descriptor_helper.hpp"
 #include "ck_tile/host/convolution_parameter.hpp"
 #include "ck_tile/host/device_memory.hpp"

--- a/include/ck_tile/host/concat.hpp
+++ b/include/ck_tile/host/concat.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
+namespace ck_tile {
+template <typename T>
+struct IsCharArray : std::false_type
+{
+};
+template <std::size_t N>
+struct IsCharArray<char[N]> : std::true_type
+{
+};
+template <std::size_t N>
+struct IsCharArray<const char[N]> : std::true_type
+{
+};
+template <std::size_t N>
+struct IsCharArray<char (&)[N]> : std::true_type
+{
+};
+template <std::size_t N>
+struct IsCharArray<const char (&)[N]> : std::true_type
+{
+};
+template <typename... Ts>
+inline constexpr bool AllConvertibleToStringView = ((std::is_convertible_v<Ts, std::string_view> ||
+                                                     IsCharArray<Ts>::value ||
+                                                     std::is_same_v<Ts, char>)&&...);
+template <typename... Ts>
+[[nodiscard]] auto concat(const Ts&... xs)
+    -> std::enable_if_t<!AllConvertibleToStringView<Ts...>, std::string>
+{
+    using ::operator<<;
+    thread_local std::ostringstream oss;
+    oss.str("");
+    (oss << ... << xs);
+    return oss.str();
+}
+template <std::size_t N>
+[[nodiscard]] constexpr inline std::size_t getSize(char (&)[N]) noexcept
+{
+    return N;
+}
+template <std::size_t N>
+[[nodiscard]] constexpr inline std::size_t getSize(const char (&)[N]) noexcept
+{
+    return N;
+}
+[[nodiscard]] constexpr inline std::size_t getSize(const char* s) noexcept
+{
+    const char* end = s;
+    while(*end++ != 0) {}
+    return end - s - 1;
+}
+[[nodiscard]] constexpr inline std::size_t getSize(const char&) noexcept { return 1; }
+[[nodiscard]] inline std::size_t getSize(const std::string& s) noexcept { return s.size(); }
+[[nodiscard]] constexpr inline std::size_t getSize(const std::string_view& s) noexcept
+{
+    return s.size();
+}
+template <typename... Ts>
+auto concatInto(std::string& result, const Ts&... xs)
+    -> std::enable_if_t<AllConvertibleToStringView<Ts...>, void>
+{
+    const std::size_t space = (1 + ... + getSize(xs));
+    result.reserve(result.size() + space);
+    ((result += xs), ...);
+}
+template <typename... Ts>
+[[nodiscard]] auto concat(const Ts&... xs)
+    -> std::enable_if_t<AllConvertibleToStringView<Ts...>, std::string>
+{
+    std::string result;
+    concatInto(result, xs...);
+    return result;
+}
+// Function for types convertible to std::string_view
+template <typename Sep, typename First, typename... Rest>
+[[nodiscard]] auto concat(Sep sep, const First& first, const Rest&... rest)
+    -> std::enable_if_t<AllConvertibleToStringView<First, Rest...>, std::string>
+{
+    std::string result;
+    result += first;
+    ((result += sep, result += rest), ...);
+    return result;
+}
+// Function for other types
+template <typename Sep, typename First, typename... Rest>
+[[nodiscard]] auto concat(Sep sep, const First& first, const Rest&... rest)
+    -> std::enable_if_t<!AllConvertibleToStringView<First, Rest...>, std::string>
+{
+    using ::operator<<;
+    thread_local std::ostringstream oss;
+    oss.str("");
+    oss << first;
+    ((oss << sep << rest), ...);
+    return oss.str();
+}
+} // namespace ck_tile
--- a/include/ck_tile/host/reference/reference_moe_sorting.hpp
+++ b/include/ck_tile/host/reference/reference_moe_sorting.hpp
@@ -14,12 +14,15 @@ namespace ck_tile {
 template <typename WeightType, typename IndexType = index_t>
 CK_TILE_HOST void reference_moe_sorting(const HostTensor<IndexType>& topk_ids,
                                        const HostTensor<WeightType>& weights,
+                                        const HostTensor<IndexType>& local_expert_mask,
                                        HostTensor<IndexType>& p_sorted_token_ids,
                                        HostTensor<WeightType>& sorted_weight,
                                        HostTensor<IndexType>& sorted_expert_ids,
                                        index_t& unit_cnt,
                                        const index_t experts,
-                                        const index_t unit_size)
+                                        const index_t unit_size,
+                                        bool local_expert_masking,
+                                        bool skip_experts_with_zero_token = true)
 {
    const index_t num_token = topk_ids.mDesc.get_lengths()[0];
    const index_t topk      = topk_ids.mDesc.get_lengths()[1];
@@ -33,8 +36,11 @@ CK_TILE_HOST void reference_moe_sorting(const HostTensor<IndexType>& topk_ids,
 #endif
    std::vector<std::vector<WeightType>> expert_token_weights(
        experts, std::vector<WeightType>(unit_size, 0));
+    // count number of unit-size slices in this expert
    std::vector<IndexType> expert_slices(experts, 1);
+    // count the tokens used in this expert
    std::vector<IndexType> expert_slice_idxs(experts, 0);
+    // TODO: above 2 buffer seems duplicated
    for(index_t t = 0; t < num_token; t++)
    {
@@ -72,8 +78,23 @@ CK_TILE_HOST void reference_moe_sorting(const HostTensor<IndexType>& topk_ids,
    IndexType* out_tokens    = p_sorted_token_ids.data();
    WeightType* out_weights  = sorted_weight.data();
    IndexType* out_expert_id = sorted_expert_ids.data();
+    int curr_expert_id       = 0;
    for(index_t e = 0; e < experts; e++)
    {
+        if(local_expert_masking)
+        {
+            if(local_expert_mask(e) == 0)
+                continue;
+        }
+        if(skip_experts_with_zero_token)
+        {
+            if(expert_slice_idxs[e] == 0)
+            {
+                curr_expert_id++;
+                continue;
+            }
+        }
        memcpy(out_tokens, expert_tokens[e].data(), sizeof(index_t) * expert_slices[e] * unit_size);
        out_tokens += expert_slices[e] * unit_size;
        memcpy(out_weights,
@@ -83,10 +104,11 @@ CK_TILE_HOST void reference_moe_sorting(const HostTensor<IndexType>& topk_ids,
        for(index_t s = 0; s < expert_slices[e]; s++)
        {
-            out_expert_id[s] = e;
+            out_expert_id[s] = curr_expert_id;
            unit_cnt++;
        }
        out_expert_id += expert_slices[e];
+        curr_expert_id++;
    }
    unit_cnt *= unit_size;
    return;

--- a/include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp
+++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant.hpp
@@ -10,3 +10,4 @@
 #include "ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp"
 #include "ck_tile/ops/common/generic_2d_block_shape.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
+#include "ck_tile/ops/common/utils.hpp"
--- a/include/ck_tile/ops/batched_transpose.hpp
+++ b/include/ck_tile/ops/batched_transpose.hpp
@@ -9,3 +9,4 @@
 #include "ck_tile/ops/batched_transpose/pipeline/batched_transpose_problem.hpp"
 #include "ck_tile/ops/common/generic_2d_block_shape.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
+#include "ck_tile/ops/common/utils.hpp"
--- a/include/ck_tile/ops/common.hpp
+++ b/include/ck_tile/ops/common.hpp
@@ -5,3 +5,4 @@
 #include "ck_tile/ops/common/generic_2d_block_shape.hpp"
 #include "ck_tile/ops/common/tensor_layout.hpp"
+#include "ck_tile/ops/common/utils.hpp"