results ok

00627fed · coderfeli · 6b51413b · 00627fed · 00627fed · 00627fed
Commit 00627fed authored Feb 04, 2025 by coderfeli
9 changed files
--- a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_bpreshuffle.cpp
+++ b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_bpreshuffle.cpp
@@ -17,7 +17,7 @@
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/literals.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_moe_gemm.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/utility/blkgemmpipe_scheduler.hpp"
@@ -26,7 +26,7 @@ template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
 using F16  = ck::half_t;
-using BF16 = ck::bhalf_t;
+// using BF16 = ck::bhalf_t;
 // using F16  = ck::f8_t;
 using F32  = float;
@@ -61,41 +61,23 @@ struct MultiplyMultiply
                                                                            const float& d0,
                                                                            const float& d1) const
    {
-        const float x0_f = c * d0 * d1;
+        // const float x0_f = c * d0 * d1;
+        const float x0_f = c;
+        // printf("epi %f\n", c);
        e = ck::type_convert<F16>(x0_f);
    }
-    template <>
+    // template <>
-    __host__ __device__ constexpr void operator()<BF16, float, float, float>(BF16& e,
+    // __host__ __device__ constexpr void operator()<BF16, float, float, float>(BF16& e,
-                                                                             const float& c,
+    //                                                                          const float& c,
-                                                                             const float& d0,
+    //                                                                          const float& d0,
-                                                                             const float& d1) const
+    //                                                                          const float& d1) const
-    {
+    // {
-        const float x0_f = c * d0 * d1;
+    //     const float x0_f = c;
+    //     // const float x0_f = c * d0 * d1;
-        e = ck::type_convert<BF16>(x0_f);
-    }
+    //     e = ck::type_convert<BF16>(x0_f);
+    // }
-    template <>
-    __host__ __device__ constexpr void operator()<ck::half_t, int, float, float>(
-        ck::half_t& e, const int& c, const float& d0, const float& d1) const
-    {
-        const float x0_f =
-            ck::type_convert<float>(c) * ck::type_convert<float>(d0) * ck::type_convert<float>(d1);
-        e = ck::type_convert<ck::half_t>(x0_f);
-    }
-    template <>
-    __host__ __device__ constexpr void operator()<ck::bhalf_t, int, float, float>(
-        ck::bhalf_t& e, const int& c, const float& d0, const float& d1) const
-    {
-        const float x0_f =
-            ck::type_convert<float>(c) * ck::type_convert<float>(d0) * ck::type_convert<float>(d1);
-        e = ck::type_convert<ck::bhalf_t>(x0_f);
-    }
 };
 void preShuffleBuffer(const F16* src, F16* dst, int N, int K, int NXdl)
@@ -153,8 +135,8 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultiD_Xdl_CShu
               8,   8,
               32,   32,
               1,    1,
-               S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0,
+               S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1,
-               S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0,
+               S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1,
               //    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
               //    MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
                //  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
@@ -169,46 +151,32 @@ int main(int argc, char* argv[])
 {
    bool do_verification = true;
    int init_method      = 1;
-    bool time_kernel     = false;
+    bool time_kernel     = true;
+// tokens = 1
+// topk = 1
+// experts = 8
+// per expert: 
    // GEMM shape
-    ck::index_t M = 3840;
    ck::index_t N = 4096;
    ck::index_t K = 4096;
+    ck::index_t experts = 8;
-    ck::index_t StrideA = K;
+    ck::index_t sorted_tile_num = 8;
-    ck::index_t StrideB = K;
+    ck::index_t sorted_tile_size = 32;
-    ck::index_t StrideD = 0;
+    ck::index_t SORTED_SIZE = sorted_tile_num * sorted_tile_size;
-    ck::index_t StrideE = N;
+    ck::index_t tokens = 32;
-    ck::index_t KBatch = 1;
    if(argc == 1)
    {
        // use default case
    }
-    else if(argc == 4)
+    else if(argc == 6)
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = std::stoi(argv[3]);
-    }
+        N = std::stoi(argv[4]);
-    else if(argc == 12)
+        K = std::stoi(argv[5]);
-    {
-        do_verification = std::stoi(argv[1]);
-        init_method     = std::stoi(argv[2]);
-        time_kernel     = std::stoi(argv[3]);
-        M = std::stoi(argv[4]);
-        N = std::stoi(argv[5]);
-        K = std::stoi(argv[6]);
-        StrideA = std::stoi(argv[7]);
-        StrideB = std::stoi(argv[8]);
-        StrideD = std::stoi(argv[9]);
-        StrideE = std::stoi(argv[10]);
-        KBatch = std::stoi(argv[11]);
    }
    else
    {
@@ -216,10 +184,18 @@ int main(int argc, char* argv[])
        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
        printf("arg3: time kernel (0=no, 1=yes)\n");
        printf(
-            "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD, StrideE, KBatch\n");
+            "arg4 to 5: N, K\n");
        exit(0);
    }
+    ck::index_t StrideA = K;
+    ck::index_t StrideB = K;
+    ck::index_t StrideD = 0;
+    ck::index_t StrideE = N;
+    ck::index_t KBatch = 1;
    auto f_host_tensor_descriptor =
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
            using namespace ck::literals;
@@ -233,66 +209,75 @@ int main(int argc, char* argv[])
                return HostTensorDescriptor({row, col}, {1_uz, stride});
            }
        };
-    const ck::index_t experts = 8;
+    // const ck::index_t experts = 8;
    Tensor<ck::index_t> expert_ids(HostTensorDescriptor({experts}, {1}));
-    Tensor<ck::index_t> sorted_token_ids(HostTensorDescriptor({M}, {1}));
+    Tensor<ck::index_t> sorted_token_ids(HostTensorDescriptor({SORTED_SIZE}, {1}));
-    for (int i = 0; i < experts; i++) {
+    for (int i = 0; i < sorted_tile_num; i++) {
        expert_ids.mData[i] = i;
    }
+    int token_per_tile = tokens / sorted_tile_num;
-    for (int i = 0; i < M; i++) {
+    int tokenid = 0;
-        sorted_token_ids.mData[i] = i % (M / 2);
+    // sorted_token_ids.mData[0] = 0;
+    for (int i = 0; i < SORTED_SIZE; i++) {
+        int tile_off = i % sorted_tile_size;
+        if(tile_off < token_per_tile)
+            sorted_token_ids.mData[i] = tokenid++;
+        else
+            sorted_token_ids.mData[i] = tokens;
    }
-    Tensor<A0DataType> a0_m_k(f_host_tensor_descriptor(M, K, StrideA, A0Layout{}));
+    Tensor<A0DataType> a0_t_k(HostTensorDescriptor({tokens, K}, {K, 1}));
-    Tensor<B0DataType> b0_k_n(f_host_tensor_descriptor(K, N * experts, StrideB, B0Layout{}));
+    Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, N, K}, {N*K, K, 1}));
-    Tensor<B0DataType> b0_preshuffled(
+    Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, N, K}, {N*K, K, 1}));
-        f_host_tensor_descriptor(K, N, StrideB, B0Layout{})); // use laout only for size
+    // Tensor<B0DataType> b0_e_n_k(f_host_tensor_descriptor(K, N * experts, StrideB, B0Layout{}));
-    Tensor<D0DataType> d0_m_n(f_host_tensor_descriptor(M, N, StrideD, D0Layout{}));
+    // Tensor<B0DataType> b0_preshuffled(
-    Tensor<D1DataType> d1_m_n(f_host_tensor_descriptor(M, N, StrideD, D1Layout{}));
+    //     f_host_tensor_descriptor(K, N, StrideB, B0Layout{})); // use laout only for size
-    Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+    Tensor<D0DataType> d0_t_n(f_host_tensor_descriptor(tokens, N, StrideD, D0Layout{}));
-    Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+    Tensor<D1DataType> d1_t_n(f_host_tensor_descriptor(tokens, N, StrideD, D1Layout{}));
+    Tensor<B0DataType> e_m_n_host_result(HostTensorDescriptor({SORTED_SIZE, N}, {N, 1}));
-    std::cout << "a0_m_k: " << a0_m_k.mDesc << std::endl;
+    Tensor<B0DataType> e_m_n_device_result(HostTensorDescriptor({SORTED_SIZE, N}, {N, 1}));
-    std::cout << "b0_k_n: " << b0_k_n.mDesc << std::endl;
-    std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl;
+    std::cout << "a0_t_k: " << a0_t_k.mDesc << std::endl;
-    std::cout << "d0_m_n: " << d0_m_n.mDesc << std::endl;
+    std::cout << "b0_e_n_k: " << b0_e_n_k.mDesc << std::endl;
+    std::cout << "d1_t_n: " << d1_t_n.mDesc << std::endl;
+    std::cout << "d0_t_n: " << d0_t_n.mDesc << std::endl;
    std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
    switch(init_method)
    {
    case 0: break;
    case 1:
-        a0_m_k.GenerateTensorValue(GeneratorTensor_2<A0DataType>{-2, 2});
+        a0_t_k.GenerateTensorValue(GeneratorTensor_2<A0DataType>{-2, 2});
-        b0_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{0, 2});
+        b0_e_n_k.GenerateTensorValue(GeneratorTensor_2<B0DataType>{0, 2});
-        d0_m_n.GenerateTensorValue(GeneratorTensor_2<D0DataType>{-2, 2});
+        d0_t_n.GenerateTensorValue(GeneratorTensor_2<D0DataType>{-2, 2});
-        d1_m_n.GenerateTensorValue(GeneratorTensor_2<D1DataType>{-2, 2});
+        d1_t_n.GenerateTensorValue(GeneratorTensor_2<D1DataType>{-2, 2});
        break;
    case 2:
-        a0_m_k.GenerateTensorValue(GeneratorTensor_1<A0DataType>{});
+        a0_t_k.GenerateTensorValue(GeneratorTensor_1<A0DataType>{});
-        b0_k_n.GenerateTensorValue(GeneratorTensor_1<B0DataType>{});
+        b0_e_n_k.GenerateTensorValue(GeneratorTensor_1<B0DataType>{});
-        d0_m_n.GenerateTensorValue(GeneratorTensor_1<D0DataType>{});
+        d0_t_n.GenerateTensorValue(GeneratorTensor_1<D0DataType>{});
-        d1_m_n.GenerateTensorValue(GeneratorTensor_1<D1DataType>{});
+        d1_t_n.GenerateTensorValue(GeneratorTensor_1<D1DataType>{});
        break;
    default:
-        a0_m_k.GenerateTensorValue(GeneratorTensor_3<A0DataType>{0.0, 1.0});
+        a0_t_k.GenerateTensorValue(GeneratorTensor_3<A0DataType>{0.0, 1.0});
-        b0_k_n.GenerateTensorValue(GeneratorTensor_3<B0DataType>{-0.5, 0.5});
+        b0_e_n_k.GenerateTensorValue(GeneratorTensor_3<B0DataType>{-0.5, 0.5});
-        d0_m_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{0.0, 1.0});
+        d0_t_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{0.0, 1.0});
-        d1_m_n.GenerateTensorValue(GeneratorTensor_3<D1DataType>{0.0, 1.0});
+        d1_t_n.GenerateTensorValue(GeneratorTensor_3<D1DataType>{0.0, 1.0});
    }
    DeviceMem sorted_token_ids_dev(sizeof(ck::index_t) * sorted_token_ids.mDesc.GetElementSpaceSize());
    DeviceMem expert_ids_dev(sizeof(ck::index_t) * expert_ids.mDesc.GetElementSpaceSize());
-    DeviceMem a0_device_buf(sizeof(A0DataType) * a0_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem a0_device_buf(sizeof(A0DataType) * a0_t_k.mDesc.GetElementSpaceSize());
-    DeviceMem b0_device_buf(sizeof(B0DataType) * b0_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem b0_device_buf(sizeof(B0DataType) * b0_e_n_k.mDesc.GetElementSpaceSize());
-    DeviceMem d0_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpaceSize());
+    DeviceMem d0_device_buf(sizeof(D0DataType) * d0_t_n.mDesc.GetElementSpaceSize());
-    DeviceMem d1_device_buf(sizeof(D1DataType) * d1_m_n.mDesc.GetElementSpaceSize());
+    DeviceMem d1_device_buf(sizeof(D1DataType) * d1_t_n.mDesc.GetElementSpaceSize());
    DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
+    a0_t_k.savetxt("a.txt");
    sorted_token_ids_dev.ToDevice(sorted_token_ids.mData.data());
    expert_ids_dev.ToDevice(expert_ids.mData.data());
-    a0_device_buf.ToDevice(a0_m_k.mData.data());
+    a0_device_buf.ToDevice(a0_t_k.mData.data());
-    d0_device_buf.ToDevice(d0_m_n.mData.data());
+    d0_device_buf.ToDevice(d0_t_n.mData.data());
-    d1_device_buf.ToDevice(d1_m_n.mData.data());
+    d1_device_buf.ToDevice(d1_t_n.mData.data());
    e_device_buf.ToDevice(e_m_n_device_result.mData.data());
    auto a_element_op   = AElementOp{};
@@ -308,7 +293,7 @@ int main(int argc, char* argv[])
    int NPerXdl = device_op.GetPreShuffleParameters();
-    preShuffleBuffer(b0_k_n.mData.data(), b0_preshuffled.mData.data(), N, K, NPerXdl);
+    preShuffleBuffer(b0_e_n_k.mData.data(), b0_preshuffled.mData.data(), N * experts, K, NPerXdl);
    b0_device_buf.ToDevice(b0_preshuffled.mData.data());
@@ -321,7 +306,8 @@ int main(int argc, char* argv[])
                               std::array<const void*, NumDTensor>{d0_device_buf.GetDeviceBuffer(),
                                                                   d1_device_buf.GetDeviceBuffer()},
                               e_device_buf.GetDeviceBuffer(),
-                               M,
+                               tokens,
+                               SORTED_SIZE,
                               N,
                               K,
                               StrideA,
@@ -339,53 +325,56 @@ int main(int argc, char* argv[])
            "wrong! device_gemm with the specified compilation parameters does "
            "not support this GEMM problem");
    }
+    if (time_kernel) {
+        float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
-    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+        std::size_t flop = std::size_t(2) * SORTED_SIZE * N * K * experts;
+        std::size_t num_btype =
-    std::size_t flop = std::size_t(2) * M * N * K;
+            sizeof(A0DataType) * SORTED_SIZE * K + sizeof(B0DataType) * K * N * experts + sizeof(EDataType) * SORTED_SIZE * N;
-    std::size_t num_btype =
-        sizeof(A0DataType) * M * K + sizeof(B0DataType) * K * N + sizeof(EDataType) * M * N;
-    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+        float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
-    float gb_per_sec = num_btype / 1.E6 / ave_time;
+        float gb_per_sec = num_btype / 1.E6 / ave_time;
-    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
+        std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
-              << std::endl;
+                << std::endl;
+    }
    if(do_verification)
    {
-        invoker.Run(argument, StreamConfig{nullptr, false});
+        invoker.Run(argument, StreamConfig{nullptr, false, 0 ,0,1});
        e_device_buf.FromDevice(e_m_n_device_result.mData.data());
-        Tensor<CShuffleDataType> c_m_n({M, N});
+        Tensor<CShuffleDataType> c_m_n({SORTED_SIZE, N});
-        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<A0DataType,
+        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceMoeGemm<A0DataType,
                                                                                B0DataType,
                                                                                CShuffleDataType,
                                                                                AccDataType,
                                                                                PassThrough,
                                                                                PassThrough,
                                                                                PassThrough>;
-        auto ref_gemm               = ReferenceGemmInstance{};
+        auto ref_moe_gemm           = ReferenceGemmInstance{};
-        auto ref_invoker            = ref_gemm.MakeInvoker();
+        auto ref_invoker            = ref_moe_gemm.MakeInvoker();
-        auto ref_argument = ref_gemm.MakeArgument(
+        auto ref_argument = ref_moe_gemm.MakeArgument(
-            a0_m_k, b0_k_n, c_m_n, PassThrough{}, PassThrough{}, PassThrough{});
+           sorted_token_ids, expert_ids, a0_t_k, b0_e_n_k, c_m_n, PassThrough{}, PassThrough{}, PassThrough{});
        ref_invoker.Run(ref_argument);
+        for(int m = 0; m < SORTED_SIZE; ++m)
-        for(int m = 0; m < M; ++m)
        {
+            const int t = sorted_token_ids(m);
            for(int n = 0; n < N; ++n)
            {
-                cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d0_m_n(m, n), d1_m_n(m, n));
+                cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d0_t_n(t, n), d1_t_n(t, n));
            }
        }
        e_device_buf.FromDevice(e_m_n_device_result.mData.data());
+        e_m_n_device_result.savetxt("out.txt");
+        e_m_n_host_result.savetxt("ref.txt");
        return ck::utils::check_err(
                   e_m_n_device_result, e_m_n_host_result, "Error: Incorrect results!", 1e-3, 5e-2)
                   ? 0

--- a/include/ck/library/utility/host_tensor.hpp
+++ b/include/ck/library/utility/host_tensor.hpp
@@ -6,6 +6,7 @@
 #include <algorithm>
 #include <cassert>
 #include <iostream>
+#include <fstream>
 #include <numeric>
 #include <thread>
 #include <utility>
@@ -313,7 +314,32 @@ struct Tensor
    explicit Tensor(const Tensor<FromT>& other) : Tensor(other.template CopyAsType<T>())
    {
    }
+    void savetxt(std::string file_name, std::string dtype = "float")
+        {
+            std::ofstream file(file_name);
+            if(file.is_open())
+            {
+                for(auto& itm : mData)
+                {
+                    if(dtype == "float")
+                        file << ck::type_convert<float>(itm) << std::endl;
+                    else if(dtype == "int")
+                        file << ck::type_convert<int>(itm) << std::endl;
+                    else
+                        // TODO: we didn't implement operator<< for all custom
+                        // data types, here fall back to float in case compile error
+                        file << ck::type_convert<float>(itm) << std::endl;
+                }
+                file.close();
+            }
+            else
+            {
+                // Print an error message to the standard error
+                // stream if the file cannot be opened.
+                throw std::runtime_error(std::string("unable to open file:") + file_name);
+            }
+        }
    decltype(auto) GetLengths() const { return mDesc.GetLengths(); }
    decltype(auto) GetStrides() const { return mDesc.GetStrides(); }

--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v1.hpp
@@ -305,6 +305,9 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::I
                    a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+                    // printf("bid %d tid %d %f %f\n", blockIdx.x, threadIdx.x,
+                    //     type_convert<float>(a_thread_buf[I0]),
+                    //     type_convert<float>(b_thread_bufs[mfma_reg_buf][I0]));
                    static_for<0, MRepeat, 1>{}([&](auto m0) {
                        static_for<0, NRepeat, 1>{}([&](auto n0) {
                            static_for<0, KRepeat, 1>{}([&](auto k0) {
@@ -320,7 +323,6 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::I
                                                     [Number<b_thread_desc_.CalculateOffset(
                                                         make_tuple(n0, I0, k0, ik))>{}];
                                });
                                using mfma_input_type =
                                    typename vector_type<ComputeDataType,
                                                         xdlops_gemm.K1PerXdlops>::type;

--- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp
+++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp
@@ -52,7 +52,7 @@ struct ThreadGroupTensorSliceTransfer_v4r1
    __device__ constexpr ThreadGroupTensorSliceTransfer_v4r1(
        const SrcDesc& src_desc,
-        const Index& src_block_slice_origin, 
+        const Index& src_block_slice_origin,
        const SrcElementwiseOperation& src_element_op,
        const DstDesc& dst_desc,
        const Index& dst_block_slice_origin,
@@ -83,7 +83,7 @@ struct ThreadGroupTensorSliceTransfer_v4r1
           ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
        {
            const auto thread_cluster_idx = thread_cluster_desc_.CalculateBottomIndex(
-                make_multi_index(ThreadGroup::GetThreadId() % 8));
+                make_multi_index(ThreadGroup::GetThreadId()));
            const auto thread_data_idx_begin = thread_cluster_idx * thread_slice_lengths;
@@ -100,7 +100,7 @@ struct ThreadGroupTensorSliceTransfer_v4r1
           ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
        {
            const auto thread_cluster_idx = thread_cluster_desc_.CalculateBottomIndex(
-                make_multi_index(ThreadGroup::GetThreadId() % 8));
+                make_multi_index(ThreadGroup::GetThreadId()));
            const auto thread_data_idx_begin = thread_cluster_idx * thread_slice_lengths;

--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_b_preshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_b_preshuffle.hpp
@@ -412,6 +412,7 @@ struct DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle
                             const void* p_b,
                             std::array<const void*, NumDTensor> p_ds,
                             void* p_c,
+                             index_t NumTokens,
                             index_t M,
                             index_t N,
                             index_t K,
@@ -430,6 +431,7 @@ struct DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle
                        static_cast<const BDataType*>(p_b),
                        p_ds,
                        static_cast<CDataType*>(p_c),
+                        NumTokens,
                        M,
                        N,
                        K,
@@ -461,14 +463,16 @@ struct DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle
                                                      index_t KBatch,
                                                      AElementwiseOperation a_element_op,
                                                      BElementwiseOperation b_element_op,
-                                                      CElementwiseOperation c_element_op) override
+                                                      CElementwiseOperation c_element_op)
    {
+        // assert(0, "no impl");
        return std::make_unique<Argument>(nullptr, nullptr, 
                                         static_cast<const ADataType*>(p_a),
                                          static_cast<const BDataType*>(p_b),
                                          p_ds,
                                          static_cast<CDataType*>(p_c),
                                          M,
+                                          M,
                                          N,
                                          K,
                                          StrideA,

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle.hpp
@@ -9,7 +9,7 @@
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
 #include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_selector.hpp"
-#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp"
+#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1_mod8.hpp"
 #include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp"
 #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
@@ -175,6 +175,7 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3_b_preshuffle
    static constexpr index_t NLane   = NPerXdl;
    static constexpr index_t NWave   = NPerBlock / NPerXdl / NXdlPerWave;
    static_assert(NWave * warpSize == BlockSize);
+    // static constexpr index_t NumTokens = 1;
    static constexpr index_t Experts   = 8;
    static constexpr index_t SortedTileSize   = 32;
@@ -513,7 +514,8 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3_b_preshuffle
    struct Problem
    {
-        __host__ __device__ Problem(index_t M_,
+        __host__ __device__ Problem(index_t NumTokens_,
+                                    index_t M_,
                                    index_t N_,
                                    index_t K_,
                                    index_t StrideA_,
@@ -521,7 +523,9 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3_b_preshuffle
                                    std::array<index_t, NumDTensor> StrideDs_,
                                    index_t StrideC_,
                                    index_t KBatch_)
-            : M{M_},
+            : 
+              NumTokens{NumTokens_},
+              M{M_},
              N{N_},
              K{K_},
              StrideA{StrideA_},
@@ -545,6 +549,7 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3_b_preshuffle
        __host__ void Print() const
        {
            std::cout << "problem {"
+                      << "NumTokens:" << NumTokens << ", "
                      << "M:" << M << ", "
                      << "N:" << N << ", "
                      << "K:" << K << ", "
@@ -561,6 +566,7 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3_b_preshuffle
                      << "NBlock: " << NBlock << "}" << std::endl;
        }
+        index_t NumTokens;
        index_t M;
        index_t N;
        index_t K;
@@ -592,6 +598,7 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3_b_preshuffle
                          const BDataType* p_b_grid_,
                          std::array<const void*, NumDTensor> p_ds_grid_,
                          CDataType* p_c_grid_,
+                          index_t NumTokens_,
                          index_t M_,
                          index_t N_,
                          index_t K_,
@@ -603,7 +610,7 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3_b_preshuffle
                          AElementwiseOperation a_element_op_,
                          BElementwiseOperation b_element_op_,
                          CElementwiseOperation c_element_op_)
-            : Problem{M_, N_, K_, StrideA_, StrideB_, StrideDs_, StrideC_, k_batch_},
+            : Problem{NumTokens_, M_, N_, K_, StrideA_, StrideB_, StrideDs_, StrideC_, k_batch_},
        p_sorted_token_ids{p_sorted_token_ids_},
        p_sorted_expert_ids{p_sorted_expert_ids_},
@@ -1103,13 +1110,14 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3_b_preshuffle
    {
        ignore                           = b_element_op;
        const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(
-            problem.M, problem.MPadded, problem.K, problem.KPadded, problem.StrideA, problem.AK0);
+            problem.NumTokens, problem.MPadded, problem.K, problem.KPadded, problem.StrideA, problem.AK0);
        const auto b_grid_desc_bpreshuffled =
            MakeBGridDescriptor_Preshuffled(problem.BN0Shuffled, problem.BK0Shuffled);
        const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N<CLayout>(
            problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideC);
+        // printf("tido %d size %d %d MNBLOCK %d %d %d %d\n", threadIdx.x, problem.StrideC, c_grid_desc_m_n.GetElementSpaceSize(),
+        // problem.MBlock, problem.NBlock, MPerBlock, NPerBlock);
        const auto c_grid_desc_mblock_mperblock_nblock_nperblock =
            MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
                c_grid_desc_m_n, problem.MBlock, problem.NBlock);
@@ -1125,20 +1133,23 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3_b_preshuffle
        static_assert(MLoadRepeats == 1, "only support 1 line per thread now!");
        const index_t token_pos = block_m_id * MPerBlock + threadIdx.x / KLoadThreads;
-        index_t token_offset = __builtin_amdgcn_readfirstlane(p_sorted_token_ids[token_pos]);
+        index_t token_offset = p_sorted_token_ids[token_pos];
        const index_t m_block_data_idx_on_grid =
            __builtin_amdgcn_readfirstlane(block_m_id * MPerBlock);
        const index_t expert_stride = __builtin_amdgcn_readfirstlane(problem.N * problem.K);
        // N0, K0, Blocksize*KPack
        const index_t n_block_data_idx_on_grid =
-            __builtin_amdgcn_readfirstlane(block_n_id * NPerBlock);
+            __builtin_amdgcn_readfirstlane(block_n_id * NXdlPerWave);
        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize());
        const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_b_grid + expert_id * expert_stride, b_grid_desc_bpreshuffled.GetElementSpaceSize());
+        // if(blockIdx.x==1)
+        // printf("tid %d eid %d expert_stride %d bufsize %d\n",
+        // threadIdx.x, expert_id, expert_stride, b_grid_desc_bpreshuffled.GetElementSpaceSize());
        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
@@ -1151,7 +1162,7 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3_b_preshuffle
        constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1();
        // A matrix blockwise copy
        auto a_blockwise_copy =
-            ThreadGroupTensorSliceTransfer_v4r1<ThisThreadBlock,
+            ThreadGroupTensorSliceTransfer_v4r1_mod8<ThisThreadBlock,
                                                AElementwiseOperation,
                                                ck::tensor_operation::element_wise::PassThrough,
                                                InMemoryDataOperationEnum::Set,
@@ -1450,7 +1461,7 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3_b_preshuffle
                                           CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>>{};
            static_assert(num_access == sfc_cde_block.GetNumOfAccess(), "wrong!");
+            // printf("eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee\n");
            static_for<0, num_access, 1>{}([&](auto access_id) {
                // make sure it's safe to write to LDS
                block_sync_lds();

--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
@@ -98,7 +98,6 @@ struct ThreadwiseTensorSliceTransfer_v3r1
            detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});
        constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
        static_assert(SliceLengths::At(SrcVectorDim) % SrcScalarPerVector == 0,
                      "SliceLengths[SrcVectorDim] must be divisible by SrcScalarPerVector");
@@ -221,7 +220,13 @@ struct ThreadwiseTensorSliceTransfer_v3r1
            src_thread_scratch_tuple_(thread_scratch_id)
                .template SetAsType<dst_vector_t>(src_data_idx_seq,
                                                  op_r_v.template AsType<dst_vector_t>()[I0]);
+            // if(1) {
+            //     using print_vec_t = typename vector_type<DstData, 1>::type;
+            //     static_for<0, SrcScalarPerVector, 1>{}([&](auto idx) {
+            //         printf("tid %d %f\n",threadIdx.x, type_convert<float>(src_vector_container.template AsType<print_vec_t>()[idx]));
+            //     });
+            // }
            constexpr auto move_on_dim = [&]() constexpr
            {
                StaticallyIndexedArray<bool, nDim> move_on_dim_;
@@ -543,7 +548,13 @@ struct ThreadwiseTensorSliceTransfer_v3r1
                dst_coord_.GetOffset(),
                is_dst_valid,
                dst_vector_container.template AsType<dst_vector_t>()[I0]);
+            // if(1) {
+            //     using print_vec_t = typename vector_type<DstData, 1>::type;
+            //     static_for<0, DstScalarPerVector, 1>{}([&](auto idx) {
+            //         printf("tid %d off %d valid %d val %f\n",threadIdx.x, dst_coord_.GetOffset(), is_dst_valid, type_convert<float>(dst_vector_container.template AsType<print_vec_t>()[idx]));
+            //     });
+            // }
            constexpr auto move_on_dim = [&]() constexpr
            {
                StaticallyIndexedArray<bool, nDim> move_on_dim_;

--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r3.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r3.hpp
@@ -47,6 +47,9 @@ template <typename SrcDatas,
 struct ThreadwiseTensorSliceTransfer_v7r3
 {
    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+    static constexpr auto I2 = Number<2>{};
+    static constexpr auto I3 = Number<3>{};
    static constexpr auto SrcScalarPerVector = SrcScalarPerVectors{}[I0];
@@ -120,6 +123,7 @@ struct ThreadwiseTensorSliceTransfer_v7r3
    {
        static_for<0, nDst, 1>{}([&](auto i) {
            dst_coords_(i) = make_tensor_coordinate(dst_descs[i], dst_slice_origin_idxs[i]);
+            // printf("tid %d origin %d %d %d %d off %d\n", threadIdx.x, dst_slice_origin_idxs[i][I0], dst_slice_origin_idxs[i][I1], dst_slice_origin_idxs[i][I2], dst_slice_origin_idxs[i][I3], dst_coords_(i).GetOffset());
        });
    }
@@ -419,6 +423,14 @@ struct ThreadwiseTensorSliceTransfer_v7r3
                    dst_coords_[i].GetOffset(),
                    is_dst_valid,
                    dst_vectors[i].template AsType<dst_vector_t>()[I0]);
+                if(1) {
+                    static_for<0, DstScalarPerVector, 1>{}([&](auto idx) {
+                        using DstData = remove_cvref_t<tuple_element_t<0, DstDatas>>;
+                        using print_vec_t = typename vector_type<DstData, 1>::type;
+                        // printf("tid %d off %d valid %d %f\n",threadIdx.x, dst_coords_[i].GetOffset(), is_dst_valid, 
+                        // type_convert<float>(dst_vectors[i].template AsType<print_vec_t>()[idx]));
+                    });
+                }
            });
            // move coordinate

--- a/script/cmake-ck-dev.sh
+++ b/script/cmake-ck-dev.sh
@@ -17,7 +17,7 @@ fi
 cmake                                                                                             \
 -D CMAKE_PREFIX_PATH=/opt/rocm                                                                    \
 -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                                                         \
-D CMAKE_CXX_FLAGS="-Xclang -mllvm -Xclang -enable-post-misched=0 -std=c++17 -O3 -ftemplate-backtrace-limit=0  -fPIE  -Wno-gnu-line-marker"     \
+-D CMAKE_CXX_FLAGS="-Xclang -mllvm -Xclang -enable-post-misched=0 -std=c++17 -O3  --save-temps -ftemplate-backtrace-limit=0  -fPIE  -Wno-gnu-line-marker"     \
 -D CMAKE_BUILD_TYPE=Release                                                                       \
 -D BUILD_DEV=ON                                                                                   \
 -D GPU_TARGETS=$GPU_TARGETS                                                                       \