gaoqiong / composable_kernel_ROCM

Commit f3bbfe3e, authored Nov 18, 2024 by aska-0096

    Merge branch 'develop' of https://github.com/ROCm/composable_kernel into update_cka8w8

Parents: 2b840f5a, efb34741
Changes: 89
Showing 20 changed files with 467 additions and 60 deletions.
Changed files shown on this page:

  example/ck_tile/13_moe_sorting/moe_sorting_api.hpp  (+20, -0)
  example/ck_tile/13_moe_sorting/script/smoke_test.sh  (+19, -0)
  example/ck_tile/CMakeLists.txt  (+1, -0)
  include/ck/ck.hpp  (+5, -3)
  include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp  (+6, -4)
  include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp  (+6, -6)
  include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp  (+12, -12)
  include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp  (+6, -6)
  include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp  (+12, -12)
  include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp  (+4, -4)
  include/ck/utility/amd_wmma.hpp  (+3, -2)
  include/ck_tile/core/config.hpp  (+5, -3)
  include/ck_tile/core/tensor/shuffle_tile.hpp  (+1, -1)
  include/ck_tile/host.hpp  (+1, -0)
  include/ck_tile/host/reference/reference_moe_sorting.hpp  (+78, -0)
  include/ck_tile/ops/common/generic_2d_block_shape.hpp  (+6, -6)
  include/ck_tile/ops/fmha/block/page_block_navigator.hpp  (+9, -1)
  include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp  (+2, -0)
  include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp  (+232, -0)
  include/ck_tile/ops/fused_moe/pipeline/moe_sorting_pipeline.hpp  (+39, -0)
example/ck_tile/13_moe_sorting/moe_sorting_api.hpp (new file, 0 → 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <string>

#include "ck_tile/core.hpp"
#include "ck_tile/host.hpp"
#include "ck_tile/ops/moe_sorting.hpp"

struct moe_sorting_trait
{
    std::string index_type;
    std::string weight_type; // currently always float
};

struct moe_sorting_args : public ck_tile::MoeSortingHostArgs
{
};

float moe_sorting(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_config s);
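For orientation, here is a minimal host-side sketch of how this API could be invoked. The trait strings, field values, and buffer setup are illustrative assumptions, not part of the commit; the actual driver in example/ck_tile/13_moe_sorting wires these from command-line options.

// Hypothetical usage sketch of moe_sorting(); all concrete values are made up for illustration.
#include "moe_sorting_api.hpp"

int run_moe_sorting_example()
{
    moe_sorting_trait trait{/*index_type=*/"int32", /*weight_type=*/"fp32"};

    moe_sorting_args args{}; // inherits the ck_tile::MoeSortingHostArgs fields
    args.tokens      = 80;   // number of input tokens
    args.topk        = 4;    // experts selected per token
    args.num_experts = 17;
    args.unit_size   = 32;   // padding granularity of the sorted output
    // args.p_topk_ids, args.p_weights, args.p_sorted_token_ids, args.p_sorted_weights,
    // args.p_sorted_expert_ids, args.p_total_tokens_post_pad and (optionally) args.p_moe_buf
    // must point to device buffers allocated beforehand, e.g. with hipMalloc.

    float elapsed_ms = moe_sorting(trait, args, ck_tile::stream_config{});
    return elapsed_ms >= 0.f ? 0 : -1;
}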
example/ck_tile/13_moe_sorting/script/smoke_test.sh (new file, 0 → 100644)

# #!/bin/sh
EXE=./build/bin/tile_example_moe_sorting

$EXE -t=80 -e=17 -moe_buf_size=16
$EXE -t=111 -e=117 -moe_buf_size=4
$EXE -t=1000 -e=55 -moe_buf_size=1024
$EXE -t=99 -e=120 -moe_buf_size=10244
$EXE -t=175 -e=64 -k=8
$EXE -t=65 -e=8 -k=2
$EXE -t=1 -e=25
$EXE -t=31 -e=19 -k=15
$EXE -t=81 -e=37 -k=7
$EXE -t=23 -e=1 -k=1
$EXE -t=127 -e=99 -k=19
$EXE -t=71 -e=11 -k=11
$EXE -t=1 -e=1 -k=1
$EXE -t=99 -e=2 -k=1
$EXE -t=333 -e=99 -k=13
\ No newline at end of file
example/ck_tile/CMakeLists.txt

@@ -12,3 +12,4 @@ add_subdirectory(09_topk_softmax)
 add_subdirectory(10_rmsnorm2d)
 add_subdirectory(11_add_rmsnorm2d_rdquant)
 add_subdirectory(12_smoothquant)
+add_subdirectory(13_moe_sorting)
include/ck/ck.hpp

@@ -63,13 +63,15 @@ CK_DECLARE_ENV_VAR_BOOL(CK_LOGGING)
 #define __gfx101__
 #endif
-#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || \
-    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__)
+#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || \
+    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || \
+    defined(__gfx10_3_generic__)
 #define __gfx103__
 #endif
-#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__)
+#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || \
+    defined(__gfx1103__) || defined(__gfx11_generic__)
 #define __gfx11__
 #endif
-#if defined(__gfx1200__) || defined(__gfx1201__)
+#if defined(__gfx1200__) || defined(__gfx1201__) || defined(__gfx12_generic__)
 #define __gfx12__
 #endif
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp

@@ -381,10 +381,6 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
         {
             tildes = {i_ztilde, i_ytilde, i_xtilde};
         }
-        else
-        {
-            throw std::runtime_error("wrong! only implemented for 2D and 3D now");
-        }
 
         const auto a_grid_desc_ak0_m_ak1 =
             transform_conv_to_gemm.template MakeADescriptor_AK0_M_AK1<ALayout>(

@@ -750,6 +746,12 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
             }
         }
 
+        // check number of dimension, only implemented for 2D and 3D now
+        if(NDimSpatial != 2 && NDimSpatial != 3)
+        {
+            return false;
+        }
+
         return true;
     }
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp

@@ -93,12 +93,12 @@ __global__ void
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
     const index_t g_idx =
         __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
 
-    const long_index_t a_batch_offset = amd_wave_read_first_lane(
-        compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
-    const long_index_t b_batch_offset = amd_wave_read_first_lane(
-        compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
-    const long_index_t e_batch_offset = amd_wave_read_first_lane(
-        compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
+    const long_index_t a_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
+    const long_index_t b_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
+    const long_index_t e_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
 
     const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp

@@ -60,12 +60,12 @@ __global__ void
     const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.z * NumGroupsToMerge);
    const index_t k_idx = __builtin_amdgcn_readfirstlane(blockIdx.y * num_k_per_block);
 
-    const long_index_t a_batch_offset = amd_wave_read_first_lane(
-        compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
-    const long_index_t b_batch_offset = amd_wave_read_first_lane(
-        compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
-    const long_index_t e_batch_offset = amd_wave_read_first_lane(
-        compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
+    const long_index_t a_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
+    const long_index_t b_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
+    const long_index_t e_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
 
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];

@@ -117,12 +117,12 @@ __global__ void
     const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.z * NumGroupsToMerge);
     const index_t k_idx = __builtin_amdgcn_readfirstlane(blockIdx.y * num_k_per_block);
 
-    const long_index_t a_batch_offset = amd_wave_read_first_lane(
-        compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
-    const long_index_t b_batch_offset = amd_wave_read_first_lane(
-        compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
-    const long_index_t e_batch_offset = amd_wave_read_first_lane(
-        compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
+    const long_index_t a_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
+    const long_index_t b_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
+    const long_index_t e_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
 
     // Pass two lds pointer is the key to tell compiler that ds_read/write
     // operate on different lds chunk at same time without order dependecy
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp

@@ -98,12 +98,12 @@ __global__ void
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
     const index_t g_idx =
         __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
 
-    const long_index_t a_batch_offset = amd_wave_read_first_lane(
-        compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
-    const long_index_t b_batch_offset = amd_wave_read_first_lane(
-        compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
-    const long_index_t c_batch_offset = amd_wave_read_first_lane(
-        compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
+    const long_index_t a_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
+    const long_index_t b_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
+    const long_index_t c_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
 
     const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp

@@ -60,12 +60,12 @@ __global__ void
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
     const index_t g_idx =
         __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
 
-    const long_index_t a_batch_offset = amd_wave_read_first_lane(
-        compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
-    const long_index_t b_batch_offset = amd_wave_read_first_lane(
-        compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
-    const long_index_t e_batch_offset = amd_wave_read_first_lane(
-        compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
+    const long_index_t a_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
+    const long_index_t b_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
+    const long_index_t e_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
 
     const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);

@@ -155,12 +155,12 @@ __global__ void
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
     const index_t g_idx =
         __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
 
-    const long_index_t a_batch_offset = amd_wave_read_first_lane(
-        compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
-    const long_index_t b_batch_offset = amd_wave_read_first_lane(
-        compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
-    const long_index_t e_batch_offset = amd_wave_read_first_lane(
-        compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
+    const long_index_t a_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
+    const long_index_t b_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
+    const long_index_t e_batch_offset = amd_wave_read_first_lane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
 
     const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp

@@ -121,10 +121,10 @@ struct GridwiseTensorRearrange
         __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
 
     // Global Memory
-    const index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
-        compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
-    const index_t c_batch_offset = __builtin_amdgcn_readfirstlane(
-        compute_ptr_offset_of_batch.GetCPtrOffset(g_idx));
+    const index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
+    const index_t c_batch_offset = __builtin_amdgcn_readfirstlane(
+        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetCPtrOffset(g_idx)));
 
     const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
         p_in_global + a_batch_offset, in_grid_desc.GetElementSpaceSize());
include/ck/utility/amd_wmma.hpp

@@ -9,7 +9,8 @@
 // TODO: Add arch limitation
 namespace ck {
-#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__)
+#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || \
+    defined(__gfx1103__) || defined(__gfx11_generic__)
 #define __gfx11__
 #endif
 /********************************WAVE32 MODE***********************************************/

@@ -260,7 +261,7 @@ struct intrin_wmma_i32_16x16x16_iu8_w64<16, 16, neg_a, neg_b, clamp>
 // gfx12
 /********************************WAVE32 MODE***********************************************/
-#if defined(__gfx1200__) || defined(__gfx1201__)
+#if defined(__gfx1200__) || defined(__gfx1201__) || defined(__gfx12_generic__)
 #define __gfx12__
 #endif
include/ck_tile/core/config.hpp

@@ -11,13 +11,15 @@
 #define __gfx94__
 #endif
-#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || \
-    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__)
+#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || \
+    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || \
+    defined(__gfx10_3_generic__)
 #define __gfx103__
 #endif
-#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__)
+#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || \
+    defined(__gfx1103__) || defined(__gfx11_generic__)
 #define __gfx11__
 #endif
-#if defined(__gfx1200__) || defined(__gfx1201__)
+#if defined(__gfx1200__) || defined(__gfx1201__) || defined(__gfx12_generic__)
 #define __gfx12__
 #endif
include/ck_tile/core/tensor/shuffle_tile.hpp

@@ -170,7 +170,7 @@ CK_TILE_DEVICE void shuffle_tile(OutTensor& out, const InTensor& in)
     }
     else
     {
-        // NOT implemented
+        static_assert(false, "The shuffle should always happen!");
     }
 }
include/ck_tile/host.hpp

@@ -23,6 +23,7 @@
 #include "ck_tile/host/reference/reference_gemm.hpp"
 #include "ck_tile/host/reference/reference_im2col.hpp"
 #include "ck_tile/host/reference/reference_layernorm2d_fwd.hpp"
+#include "ck_tile/host/reference/reference_moe_sorting.hpp"
 #include "ck_tile/host/reference/reference_permute.hpp"
 #include "ck_tile/host/reference/reference_reduce.hpp"
 #include "ck_tile/host/reference/reference_rmsnorm2d_fwd.hpp"
include/ck_tile/host/reference/reference_moe_sorting.hpp (new file, 0 → 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"

namespace ck_tile {

template <typename WeightType, typename IndexType = index_t>
CK_TILE_HOST void reference_moe_sorting(const HostTensor<IndexType>& topk_ids,
                                        const HostTensor<WeightType>& weights,
                                        HostTensor<IndexType>& p_sorted_token_ids,
                                        HostTensor<WeightType>& sorted_weight,
                                        HostTensor<IndexType>& sorted_expert_ids,
                                        index_t& unit_cnt,
                                        const index_t experts,
                                        const index_t unit_size)
{
    const index_t num_token = topk_ids.mDesc.get_lengths()[0];
    const index_t topk      = topk_ids.mDesc.get_lengths()[1];

    std::vector<std::vector<IndexType>> expert_tokens(
        experts, std::vector<IndexType>(unit_size, num_token));
    std::vector<std::vector<WeightType>> expert_token_weights(
        experts, std::vector<WeightType>(unit_size, 0));
    std::vector<IndexType> expert_slices(experts, 1);
    std::vector<IndexType> expert_slice_idxs(experts, 0);

    for(index_t t = 0; t < num_token; t++)
    {
        for(index_t k = 0; k < topk; k++)
        {
            IndexType e  = topk_ids(t, k);
            WeightType w = weights(t, k);
            index_t idx  = expert_slice_idxs[e];
            if(idx > expert_slices[e] * unit_size - 1)
            {
                expert_slices[e]++;
                index_t new_size = expert_slices[e] * unit_size;
                expert_tokens[e].resize(new_size);
                expert_token_weights[e].resize(new_size);
                for(index_t i = (expert_slices[e] - 1) * unit_size; i < new_size; i++)
                {
                    expert_tokens[e][i]        = num_token;
                    expert_token_weights[e][i] = 0;
                }
            }
            expert_tokens[e][idx]        = t;
            expert_token_weights[e][idx] = w;
            expert_slice_idxs[e]++;
        }
    }

    IndexType* out_tokens    = p_sorted_token_ids.data();
    WeightType* out_weights  = sorted_weight.data();
    IndexType* out_expert_id = sorted_expert_ids.data();

    for(index_t e = 0; e < experts; e++)
    {
        memcpy(out_tokens, expert_tokens[e].data(), sizeof(index_t) * expert_slices[e] * unit_size);
        out_tokens += expert_slices[e] * unit_size;
        memcpy(out_weights,
               expert_token_weights[e].data(),
               sizeof(WeightType) * expert_slices[e] * unit_size);
        out_weights += expert_slices[e] * unit_size;
        for(index_t s = 0; s < expert_slices[e]; s++)
        {
            out_expert_id[s] = e;
            unit_cnt++;
        }
        out_expert_id += expert_slices[e];
    }
    unit_cnt *= unit_size;
    return;
}
} // namespace ck_tile
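A minimal host-side sketch of how this reference could be exercised, under the assumption that ck_tile::HostTensor can be constructed from a list of lengths as in the other ck_tile examples; the shapes, values, and the output-size bound are illustrative, not part of the commit.

// Hypothetical sketch; fill topk_ids/weights before calling, sizes are generous upper bounds.
#include "ck_tile/host.hpp"

void check_moe_sorting_reference()
{
    using ck_tile::index_t;
    const index_t tokens = 8, topk = 2, experts = 4, unit_size = 8;

    ck_tile::HostTensor<index_t> topk_ids({tokens, topk}); // expert index per (token, k)
    ck_tile::HostTensor<float>   weights({tokens, topk});  // gating weight per (token, k)

    // Upper bound: every (token, k) pair may open a new unit_size slice for its expert.
    const index_t max_len = tokens * topk + experts * unit_size;
    ck_tile::HostTensor<index_t> sorted_token_ids({max_len});
    ck_tile::HostTensor<float>   sorted_weights({max_len});
    ck_tile::HostTensor<index_t> sorted_expert_ids({max_len});

    index_t unit_cnt = 0;
    ck_tile::reference_moe_sorting<float, index_t>(topk_ids,
                                                   weights,
                                                   sorted_token_ids,
                                                   sorted_weights,
                                                   sorted_expert_ids,
                                                   unit_cnt,
                                                   experts,
                                                   unit_size);
    // unit_cnt now holds the padded total number of sorted entries (slices * unit_size).
}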
include/ck_tile/ops/common/generic_2d_block_shape.hpp

@@ -38,9 +38,7 @@ namespace ck_tile {
 template <typename BlockTile_,    // block size, seq<M, N>
           typename WarpPerBlock_, // num warps along seq<M, N>
           typename WarpTile_,     // warp size, seq<M, N>
-          typename Vector_,       // contiguous pixels(vector size) along seq<M, N>
-          index_t BlockSize_ =
-              warpSize * reduce_on_sequence(WarpPerBlock_{}, multiplies{}, number<1>{})>
+          typename Vector_>       // contiguous pixels(vector size) along seq<M, N>)>
 struct Generic2dBlockShape
 {
     // block size

@@ -70,8 +68,10 @@ struct Generic2dBlockShape
     // num of threads along seq<M, N>, within each warp
     static constexpr index_t ThreadPerWarp_M = Warp_M / Vector_M;
     static constexpr index_t ThreadPerWarp_N = Warp_N / Vector_N;
+    static constexpr index_t ThreadPerBlock_M = Block_M / Repeat_M / Vector_M;
+    static constexpr index_t ThreadPerBlock_N = Block_N / Repeat_N / Vector_N;
 
-    static constexpr index_t BlockSize = BlockSize_;
+    static constexpr index_t BlockSize = ThreadPerBlock_M * ThreadPerBlock_N;
 };
 } // namespace ck_tile
include/ck_tile/ops/fmha/block/page_block_navigator.hpp

@@ -230,7 +230,15 @@ struct PageBlockNavigator
     CK_TILE_HOST_DEVICE
     DataType* get_block_ptr(index_t block_index) const
     {
-        return physical_blocks + physical_block_indices[block_index] * block_stride + fixed_offset;
+        if(block_index < num_blocks)
+        {
+            return physical_blocks + physical_block_indices[block_index] * block_stride +
+                   fixed_offset;
+        }
+        else
+        {
+            return nullptr;
+        }
     }
 
     CK_TILE_HOST_DEVICE
     int32_t get_block_index(const WindowOrigin& global_window_origin) const
include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp

@@ -863,6 +863,8 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy<QLo
         constexpr index_t K0 = kKPerBlock / K1;
         constexpr index_t N2 = get_warp_size() / K0;
         constexpr index_t N1 = kBlockSize / get_warp_size();
+        static_assert(N2 != 0, "N2 is zero, which will lead to a division by zero error.");
+        static_assert(N1 != 0, "N1 is zero, which will lead to a division by zero error.");
         constexpr index_t N0 = kNPerBlock / (N2 * N1);
         static_assert(N0 != 0);
include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp (new file, 0 → 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck_tile/core.hpp"
#include "ck_tile/ops/common.hpp"
#include "ck_tile/ops/elementwise.hpp"
#include "ck_tile/host/hip_check_error.hpp"
#include <string>
#include <type_traits>

namespace ck_tile {

struct MoeSortingHostArgs
{
    const void* p_topk_ids;
    const void* p_weights;
    void* p_sorted_token_ids;
    void* p_sorted_weights;
    void* p_sorted_expert_ids;
    void* p_total_tokens_post_pad;
    void* p_moe_buf;

    index_t tokens;
    index_t unit_size;
    index_t num_experts;
    index_t topk;
    index_t moe_buf_bytes;
};

template <typename Problem_>
struct MoeSortingKernel
{
    using Problem    = remove_cvref_t<Problem_>;
    using IndexType  = typename Problem::IndexType;
    using WeightType = typename Problem::WeightType;

    typedef MoeSortingHostArgs MoeSortingKargs;
    using Hargs = MoeSortingHostArgs;

    struct Kargs
    {
        const void* p_topk_ids;
        const void* p_weights;
        void* p_sorted_token_ids;
        void* p_sorted_weights;
        void* p_sorted_expert_ids;
        void* p_total_tokens_post_pad;
        void* p_moe_buf;

        index_t tokens;
        index_t num_experts;
        index_t moe_buf_bytes;
        index_t tokens_per_thread;
        mdiv unit_size_mdiv;
        mdiv topk_mdiv;
    };

    CK_TILE_HOST static constexpr auto GridSize(const Hargs& h)
    {
        // TODO: assume num-experts not too much
        return dim3(1 + ck_tile::integer_divide_ceil(h.moe_buf_bytes, BlockSize(h).x * 16));
    }

    CK_TILE_HOST static constexpr auto BlockSize(const Hargs& h)
    {
        return dim3(ck_tile::integer_least_multiple(h.num_experts, ck_tile::get_warp_size()));
    }

    // in byte
    CK_TILE_HOST static constexpr auto GetSmemSize(const Hargs& h)
    {
        const auto blocks = BlockSize(h);
        return ((blocks.x + 1) * h.num_experts + (h.num_experts + 1)) * sizeof(index_t);
    }

    CK_TILE_HOST static constexpr auto MakeKargs(const Hargs& h)
    {
        Kargs k;
        k.p_topk_ids              = h.p_topk_ids;
        k.p_weights               = h.p_weights;
        k.p_sorted_token_ids      = h.p_sorted_token_ids;
        k.p_sorted_weights        = h.p_sorted_weights;
        k.p_sorted_expert_ids     = h.p_sorted_expert_ids;
        k.p_moe_buf               = h.p_moe_buf;
        k.p_total_tokens_post_pad = h.p_total_tokens_post_pad;
        k.tokens                  = h.tokens;
        k.num_experts             = h.num_experts;
        k.moe_buf_bytes           = h.moe_buf_bytes;
        const auto blocks         = BlockSize(h);
        k.tokens_per_thread       = integer_divide_ceil(h.tokens * h.topk, blocks.x);
        k.unit_size_mdiv          = mdiv{static_cast<uint32_t>(h.unit_size)};
        k.topk_mdiv               = mdiv{static_cast<uint32_t>(h.topk)};
        return k;
    }

    CK_TILE_DEVICE index_t calc_index(index_t total_col, index_t row, index_t col) const
    {
        return row * total_col + col;
    }

    CK_TILE_DEVICE void moe_buf_set_zero_kernel(uint8x16_t* buf, index_t buf_bytes) const
    {
        const index_t offset = (blockIdx.x - 1) * blockDim.x + threadIdx.x;
        if(offset < buf_bytes / 16)
        {
            buf[offset] = uint8x16_t{0};
        }
    }

    CK_TILE_DEVICE void
    moe_align_block_size_kernel(const IndexType* __restrict__ topk_id,
                                const WeightType* __restrict__ weights,
                                index_t* p_sorted_token_ids,
                                WeightType* p_sorted_weights,
                                index_t* p_sorted_expert_ids,
                                index_t* p_total_tokens_post_pad,
                                const index_t num_experts,
                                const index_t tokens_per_thread,
                                const index_t numel,
                                const mdiv unit_size_mdiv,
                                const mdiv topk_mdiv,
                                void* smem) const
    {
        const index_t tid       = static_cast<index_t>(threadIdx.x);
        const index_t start_idx = tid * tokens_per_thread;

        index_t* shared_mem  = reinterpret_cast<index_t*>(smem);
        index_t* tokens_cnts = shared_mem;                                  // 2d: (blockDim.x + 1, num_experts)
        index_t* cumsum      = shared_mem + (blockDim.x + 1) * num_experts; // 1: (num_experts + 1)

        for(int i = 0; i < num_experts; ++i)
        {
            tokens_cnts[calc_index(num_experts, tid + 1, i)] = 0;
        }

#pragma unroll Problem_::InternalLoadUnroll
        for(int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i)
        {
            ++tokens_cnts[calc_index(num_experts, tid + 1, topk_id[i])];
        }

        __syncthreads();

        if(tid < num_experts)
        {
            tokens_cnts[calc_index(num_experts, 0, tid)] = 0;
            for(int i = 1; i <= static_cast<index_t>(blockDim.x); ++i)
            {
                tokens_cnts[calc_index(num_experts, i, tid)] +=
                    tokens_cnts[calc_index(num_experts, i - 1, tid)];
            }
        }

        // __syncthreads();
        if(tid == 0)
        {
            cumsum[0] = 0;
            for(int i = 1; i <= num_experts; ++i)
            {
                auto current_units = [&]() {
                    index_t x_ = tokens_cnts[calc_index(num_experts, blockDim.x, i - 1)] +
                                 unit_size_mdiv.divisor - 1;
                    index_t y_ = unit_size_mdiv.div(x_);
                    return max(y_, 1) * unit_size_mdiv.divisor;
                }();
                cumsum[i] = cumsum[i - 1] + current_units;
            }
            *p_total_tokens_post_pad = cumsum[num_experts];
        }

        __syncthreads();

        if(tid < num_experts)
        {
            for(int i = cumsum[tid]; i < cumsum[tid + 1]; i += unit_size_mdiv.divisor)
            {
                p_sorted_expert_ids[unit_size_mdiv.div(i)] = tid;
            }
        }

#pragma unroll Problem_::InternalLoadUnroll
        for(int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i)
        {
            index_t expert_id = topk_id[i];
            index_t rank_post_pad =
                tokens_cnts[calc_index(num_experts, tid, expert_id)] + cumsum[expert_id];
            p_sorted_token_ids[rank_post_pad] = topk_mdiv.div(i);
            p_sorted_weights[rank_post_pad]   = weights[i];
            ++tokens_cnts[calc_index(num_experts, tid, expert_id)];
        }

        const index_t prefill_token = topk_mdiv.div(numel);
        if(tid < num_experts)
        {
            index_t expert_offset =
                cumsum[tid] + tokens_cnts[calc_index(num_experts, blockDim.x, tid)];
            while(expert_offset < cumsum[tid + 1])
            {
                p_sorted_token_ids[expert_offset] = prefill_token;
                p_sorted_weights[expert_offset]   = static_cast<WeightType>(0.0);
                expert_offset++;
            }
        }
    }

    CK_TILE_DEVICE void operator()(Kargs kargs) const
    {
        if(blockIdx.x > 0)
        {
            if(kargs.p_moe_buf)
            {
                moe_buf_set_zero_kernel(reinterpret_cast<uint8x16_t*>(kargs.p_moe_buf),
                                        kargs.moe_buf_bytes);
            }
            return;
        }

        const size_t numel = kargs.tokens * kargs.topk_mdiv.divisor;
        extern __shared__ char smem[];

        return moe_align_block_size_kernel(static_cast<const IndexType*>(kargs.p_topk_ids),
                                           static_cast<const WeightType*>(kargs.p_weights),
                                           static_cast<IndexType*>(kargs.p_sorted_token_ids),
                                           static_cast<WeightType*>(kargs.p_sorted_weights),
                                           static_cast<IndexType*>(kargs.p_sorted_expert_ids),
                                           static_cast<IndexType*>(kargs.p_total_tokens_post_pad),
                                           kargs.num_experts,
                                           kargs.tokens_per_thread,
                                           numel,
                                           kargs.unit_size_mdiv,
                                           kargs.topk_mdiv,
                                           smem);
    }
};
} // namespace ck_tile
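As a worked example of the launch-parameter helpers above, here is the arithmetic they perform; the concrete numbers (num_experts = 17, moe_buf_bytes = 1024, warp size 64) are illustrative assumptions, not values from the commit.

// Illustrative arithmetic only, following GridSize/BlockSize/GetSmemSize above.
// BlockSize(h).x = integer_least_multiple(17, 64)                -> 64 threads
// GridSize(h).x  = 1 + integer_divide_ceil(1024, 64 * 16)        -> 1 + 1 = 2 blocks
// GetSmemSize(h) = ((64 + 1) * 17 + (17 + 1)) * sizeof(index_t)  -> 1123 * 4 = 4492 bytes
// Block 0 performs the sorting via moe_align_block_size_kernel(); blocks 1..gridDim.x-1
// only zero-fill p_moe_buf in 16-byte chunks via moe_buf_set_zero_kernel().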
include/ck_tile/ops/fused_moe/pipeline/moe_sorting_pipeline.hpp (new file, 0 → 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck_tile/core.hpp"
#include "ck_tile/ops/fused_moe/pipeline/moe_sorting_policy.hpp"
#include <string>
#include <type_traits>

#ifndef TOPK_SOFTMAX_USE_RAW_TILE_WINDOW
#define TOPK_SOFTMAX_USE_RAW_TILE_WINDOW 0
#endif

namespace ck_tile {

// template <typename Problem_, typename Policy_ = MoeSortingPolicy>
// struct MoeSortingPipeline
// {
//     // TODO: this kernel only support warp per row
//     using Problem    = remove_cvref_t<Problem_>;
//     using Policy     = remove_cvref_t<Policy_>;
//     using WeightType = typename Problem::WeightType;
//
//     template <typename TopkIdWindow, typename WeightWindow>
//     CK_TILE_DEVICE auto operator()(const TopkIdWindow& topk_id_window,
//                                    const WeightWindow& weight_window,
//                                    index_t* p_sorted_token_ids,
//                                    WeightType* p_sorted_weights,
//                                    index_t* p_sorted_expert_ids,
//                                    index_t* p_total_tokens_post_pad,
//                                    const index_t num_experts,
//                                    const index_t unit_size,
//                                    const size_t numel,
//                                    const index_t topk)
//     {
//     }
// };

} // namespace ck_tile