Added moe_pk_i4_gemm2, function pass.

a0432459 · mtgu0705 · a09f038c · a0432459 · a0432459 · a0432459
Commit a0432459 authored Feb 15, 2025 by mtgu0705
6 changed files
--- a/example/65_gemm_multiply_multiply/CMakeLists.txt
+++ b/example/65_gemm_multiply_multiply/CMakeLists.txt
@@ -6,3 +6,4 @@ add_example_executable(example_gemm_multiply_multiply_xdl_int8 gemm_multiply_mul
 add_example_executable(example_moe_gemm1 moe_gemm1.cpp)
 add_example_executable(example_moe_gemm2 moe_gemm2.cpp)
 add_example_executable(example_moe_pk_i4_gemm1 moe_pk_i4_gemm1.cpp)
+add_example_executable(example_moe_pk_i4_gemm2 moe_pk_i4_gemm2.cpp)
--- a/example/65_gemm_multiply_multiply/moe_pk_i4_gemm1.cpp
+++ b/example/65_gemm_multiply_multiply/moe_pk_i4_gemm1.cpp
@@ -92,7 +92,7 @@ struct MulABScaleSilu
 using CDEElementOp = MulABScale;
 #if 1
-void preShuffleBuffer(const B0DataType* src, B0DataType* dst, int N, int K, int NXdl)
+void preShuffleBuffer(const I4* src, I4* dst, int N, int K, int NXdl)
 {
    int KPack = 32;
    int NLane = NXdl;
@@ -124,6 +124,7 @@ void preShuffleBuffer(const B0DataType* src, B0DataType* dst, int N, int K, int
 }
 #endif
+#if 0
 float i4_to_f32_gfx9(uint8_t i4)
 {
    static std::unordered_map<uint8_t, float> u = {{0b1000, -0.5000f},
@@ -145,6 +146,7 @@ float i4_to_f32_gfx9(uint8_t i4)
    return u[i4];
 }
+#endif
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;

--- a/example/65_gemm_multiply_multiply/moe_pk_i4_gemm2.cpp
+++ b/example/65_gemm_multiply_multiply/moe_pk_i4_gemm2.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_moe_gemm.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_moe_gemm2.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/utility/blkgemmpipe_scheduler.hpp"
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+using I4  = ck::pk_i4_t;
+using F16 = ck::half_t;
+using F8  = ck::f8_t;
+using F32 = float;
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+using A0DataType       = F8;
+using B0DataType       = I4;
+using EDataType        = F16;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using D0DataType       = F32;
+using D1DataType       = F32;
+using D2DataType       = F32;
+using DsDataType       = ck::Tuple<D0DataType, D1DataType, D2DataType>;
+using A0Layout = Row;
+using B0Layout = Col;
+using ELayout  = Row;
+using D0Layout = Row;
+using D1Layout = Col;
+using D2Layout = ELayout;
+// using DsLayoutGate = ck::Tuple<D0Layout, D1Layout>;
+using DsLayout = ck::Tuple<D0Layout, D1Layout, D2Layout>;
+// d0: ascale, d1: bscale, d2:expert weight
+struct MulABScaleExpertWeight
+{
+    template <typename E, typename C, typename D0, typename D1, typename D2>
+    __host__ __device__ constexpr void
+    operator()(E& e, const C& c, const D0& d0, const D1& d1, const D2& d2) const;
+    //real kernel use
+    template <>
+    __host__ __device__ constexpr void operator()<EDataType, float, float, float, float>
+                                                                           (EDataType& e,
+                                                                            const float& c,
+                                                                            const float& d0,
+                                                                            const float& d1,
+                                                                            const float& d2) const
+    {
+        e = ck::type_convert<EDataType>(c * d0 * d1 * d2);
+    }
+    // for reference
+    template <>
+    __host__ __device__ constexpr void operator()<float, float, float, float, float>
+                                                                           (float& e,
+                                                                            const float& c,
+                                                                            const float& d0,
+                                                                            const float& d1,
+                                                                            const float& d2) const
+    {
+        e = ck::type_convert<EDataType>(c * d0 * d1 * d2);
+    }
+};
+using CDEElementOp = MulABScaleExpertWeight;
+void preShuffleBuffer(const I4* src, I4* dst, int N, int K, int NXdl)
+{
+    int KPack = 32;
+    int NLane = NXdl;
+    int KLane = 64 / NLane;
+    int K0 = K / (KLane * KPack);
+    // K -> K0 KLane KPack
+    // N -> N0 NLane
+    // N, K -> N0 K0 KLane NLane KPack
+    int tempk;
+    for(int n = 0; n < N; ++n)
+    {
+        for(int k = 0; k < K; ++k)
+        {
+            int n0 = n / NLane;
+            int n1 = n % NLane;
+            int k0 = k / (KLane * KPack);
+            tempk  = k % (KLane * KPack);
+            int k1 = tempk / KPack;
+            int k2 = tempk % KPack;
+            int outputIndex = n0 * KPack * NLane * KLane * K0 + k0 * KPack * NLane * KLane +
+                              k1 * KPack * NLane + n1 * KPack + k2;
+            dst[outputIndex / 2] = src[(n * K + k) / 2];
+        }
+    }
+}
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = MulABScaleExpertWeight;
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr ck::index_t MPerBlock = 64;
+static constexpr ck::index_t BLOCKSIZE = 256;
+static constexpr ck::index_t NPerBlock = 128;
+static constexpr ck::index_t MNPerXDL = 32;
+static constexpr ck::index_t KPerBlock = 256 / sizeof(A0DataType);
+static constexpr ck::index_t MXDLPerWave = MPerBlock / 32; //todo fix this constraint
+static constexpr ck::index_t CShuffleMXDLPerWave = MPerBlock / 32;
+static constexpr ck::index_t CShuffleNLane = NPerBlock / 2;
+static constexpr ck::index_t CShuffleMLane = BLOCKSIZE / CShuffleNLane;
+static constexpr ck::index_t AK1 = 16 / sizeof(A0DataType);
+static constexpr ck::index_t BK1 = 16 / sizeof(B0DataType);
+static constexpr ck::index_t EVec = 2;
+static constexpr ck::index_t D0Vec = 1;
+static constexpr ck::index_t D1Vec = 1;
+static constexpr ck::index_t D2Vec = 1;
+using DeviceOpInstance = ck::tensor_operation::device::DeviceMoeGemm
+    // clang-format off
+        <      Row, Col, DsLayout, ELayout, A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType,
+               AElementOp,  BElementOp, CDEElementOp,       GemmSpec,   
+               BLOCKSIZE,   MPerBlock,   NPerBlock,    KPerBlock,
+               AK1,   BK1,
+               MNPerXDL,   MNPerXDL,
+               MXDLPerWave,    1,
+               S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, AK1, AK1, 0,
+               S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, AK1, AK1, 0,
+               CShuffleMXDLPerWave,    1,   S<1, CShuffleMLane, 1, CShuffleNLane>, S<EVec, D0Vec, D1Vec, D2Vec>,
+               ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, false, A0DataType>;
+// clang-format on
+int main(int argc, char* argv[])
+{
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = true;
+// tokens = 1
+// topk = 1
+// experts = 8
+// per expert: 
+    // GEMM shape
+    ck::index_t N = 6144;
+    ck::index_t K = 8192;
+    ck::index_t experts = 8;
+    ck::index_t sorted_tile_num = 8;
+    ck::index_t sorted_tile_size = MPerBlock;
+    ck::index_t SORTED_SIZE = sorted_tile_num * sorted_tile_size;
+    ck::index_t tokens = 64;
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 6)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+        N = std::stoi(argv[4]);
+        K = std::stoi(argv[5]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
+        printf(
+            "arg4 to 5: N, K\n");
+        exit(0);
+    }
+    ck::index_t StrideA = K;
+    ck::index_t StrideB = K;
+    ck::index_t StrideE = N;
+    constexpr ck::index_t NumDTensor = DsDataType::Size();
+    constexpr auto StrideDs = std::array<ck::index_t, NumDTensor>{0, 0, 0};
+    ck::index_t KBatch = 1;
+    // const ck::index_t experts = 8;
+    Tensor<ck::index_t> expert_ids(HostTensorDescriptor({experts}, {1}));
+    Tensor<ck::index_t> sorted_token_ids(HostTensorDescriptor({SORTED_SIZE}, {1}));
+    for (int i = 0; i < sorted_tile_num; i++) {
+        expert_ids.mData[i] = i;
+    }
+    int token_per_tile = tokens / sorted_tile_num;
+    int tokenid = 0;
+    // sorted_token_ids.mData[0] = 0;
+    for (int i = 0; i < SORTED_SIZE; i++) {
+        int tile_off = i % sorted_tile_size;
+        if(tile_off < token_per_tile)
+            sorted_token_ids.mData[i] = tokenid++;
+        else
+            sorted_token_ids.mData[i] = tokens;
+    }
+    Tensor<A0DataType> a0_m_k(HostTensorDescriptor({SORTED_SIZE, K}, {K, 1}));
+    Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N}, {N*K, 1, K}));
+    Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N}, {N*K, 1, K}));
+    Tensor<D0DataType> d0_m_n(HostTensorDescriptor({SORTED_SIZE, N}, {StrideDs[0], 0}));
+    Tensor<D1DataType> d1_e_n(HostTensorDescriptor({experts, N}, {1, StrideDs[1]}));
+    Tensor<D2DataType> d2_e_n(HostTensorDescriptor({SORTED_SIZE, N}, {1, 0}));
+    Tensor<EDataType> e_t_n_host_result(HostTensorDescriptor({tokens, N}, {N, 1}));
+    Tensor<EDataType> e_t_n_device_result(HostTensorDescriptor({tokens, N}, {N, 1}));
+    e_t_n_device_result.SetZero();
+    std::cout << "a0_m_k: " << a0_m_k.mDesc << std::endl;
+    std::cout << "b0_e_n_k: " << b0_e_n_k.mDesc << std::endl;
+    std::cout << "d2_e_n: " << d2_e_n.mDesc << std::endl;
+    std::cout << "d1_e_n: " << d1_e_n.mDesc << std::endl;
+    std::cout << "d0_m_n: " << d0_m_n.mDesc << std::endl;
+    std::cout << "e_t_n: " << e_t_n_host_result.mDesc << std::endl;
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        a0_m_k.GenerateTensorValue(GeneratorTensor_2<A0DataType>{-2, 2});
+        b0_e_n_k.GenerateTensorValue(GeneratorTensor_2<B0DataType>{0, 2});
+        d0_m_n.GenerateTensorValue(GeneratorTensor_2<D0DataType>{-2, 2});
+        d1_e_n.GenerateTensorValue(GeneratorTensor_2<D1DataType>{-2, 2});
+        d2_e_n.GenerateTensorValue(GeneratorTensor_2<D2DataType>{-2, 2});
+        break;
+    case 2:
+        a0_m_k.GenerateTensorValue(GeneratorTensor_1<A0DataType>{});
+        b0_e_n_k.GenerateTensorValue(GeneratorTensor_1<B0DataType>{});
+        d0_m_n.GenerateTensorValue(GeneratorTensor_1<D0DataType>{});
+        d1_e_n.GenerateTensorValue(GeneratorTensor_1<D1DataType>{});
+        d2_e_n.GenerateTensorValue(GeneratorTensor_1<D2DataType>{});
+        break;
+    default:
+        a0_m_k.GenerateTensorValue(GeneratorTensor_3<A0DataType>{0.0, 1.0});
+        b0_e_n_k.GenerateTensorValue(GeneratorTensor_3<B0DataType>{-0.5, 0.5});
+        d0_m_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{0.0, 1.0});
+        d1_e_n.GenerateTensorValue(GeneratorTensor_3<D1DataType>{0.0, 1.0});
+        d2_e_n.GenerateTensorValue(GeneratorTensor_3<D2DataType>{0.0, 1.0});
+    }
+    DeviceMem sorted_token_ids_dev(sizeof(ck::index_t) * sorted_token_ids.mDesc.GetElementSpaceSize());
+    DeviceMem expert_ids_dev(sizeof(ck::index_t) * expert_ids.mDesc.GetElementSpaceSize());
+    DeviceMem a0_device_buf(sizeof(A0DataType) * a0_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b0_device_buf(sizeof(B0DataType) * b0_e_n_k.mDesc.GetElementSpaceSize());
+    DeviceMem d0_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpaceSize());
+    DeviceMem d1_device_buf(sizeof(D1DataType) * d1_e_n.mDesc.GetElementSpaceSize());
+    DeviceMem d2_device_buf(sizeof(D2DataType) * d2_e_n.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf(sizeof(EDataType) * e_t_n_device_result.mDesc.GetElementSpaceSize());
+    a0_m_k.savetxt("a.txt");
+    expert_ids.savetxt("expert_ids.txt", "int");
+    sorted_token_ids.savetxt("sorted_token_ids.txt", "int");
+    d0_m_n.savetxt("d0_m_n.txt", "int");
+    d1_e_n.savetxt("d1_e_n.txt", "int");
+    d2_e_n.savetxt("d2_e_n.txt", "int");
+    sorted_token_ids_dev.ToDevice(sorted_token_ids.mData.data());
+    expert_ids_dev.ToDevice(expert_ids.mData.data());
+    a0_device_buf.ToDevice(a0_m_k.mData.data());
+    d0_device_buf.ToDevice(d0_m_n.mData.data());
+    d1_device_buf.ToDevice(d1_e_n.mData.data());
+    d2_device_buf.ToDevice(d2_e_n.mData.data());
+    e_device_buf.ToDevice(e_t_n_device_result.mData.data());
+    auto a_element_op   = AElementOp{};
+    auto b_element_op   = BElementOp{};
+    auto cde_element_op = CDEElementOp{};
+    // do GEMM
+    auto device_op = DeviceOpInstance{};
+    preShuffleBuffer(b0_e_n_k.mData.data(), b0_preshuffled.mData.data(), N * experts, K, device_op.GetPreShuffleParameters());
+    // vector pk_i4x4 permute
+    for(int e = 0; e < experts; e++)
+    {
+        for(int i = 0; i < N; i++)
+        {
+            for(int j = 0; j < K; j += 8)
+            {
+                int input[8];
+                for(int k = 0; k < 4; k++)
+                {
+                    int i4x2         = b0_preshuffled(e, j + k * 2, i).data;
+                    input[k * 2 + 0] = (i4x2 >> 4) & 0xf;
+                    input[k * 2 + 1] = (i4x2 >> 0) & 0xf;
+                }
+                // permute 01234567->20643175
+                {
+                    int hi   = input[2];
+                    int lo   = input[0];
+                    int i4x2 = (hi << 4) | lo;
+                    b0_preshuffled(e, j + 0, i) = i4x2;
+                }
+                {
+                    int hi   = input[6];
+                    int lo   = input[4];
+                    int i4x2 = (hi << 4) | lo;
+                    b0_preshuffled(e, j + 2, i) = i4x2;
+                }
+                {
+                    int hi   = input[3];
+                    int lo   = input[1];
+                    int i4x2 = (hi << 4) | lo;
+                    b0_preshuffled(e, j + 4, i) = i4x2;
+                }
+                {
+                    int hi   = input[7];
+                    int lo   = input[5];
+                    int i4x2 = (hi << 4) | lo;
+                    b0_preshuffled(e, j + 6, i) = i4x2;
+                }
+            }
+        }
+    }
+    b0_device_buf.ToDevice(b0_preshuffled.mData.data());
+    auto invoker = device_op.MakeInvoker();
+    auto argument =
+        device_op.MakeArgument(sorted_token_ids_dev.GetDeviceBuffer(),
+                                expert_ids_dev.GetDeviceBuffer(),
+                                a0_device_buf.GetDeviceBuffer(),
+                               b0_device_buf.GetDeviceBuffer(),
+                               std::array<const void*, NumDTensor>{d0_device_buf.GetDeviceBuffer(),
+                                                                   d1_device_buf.GetDeviceBuffer(),
+                                                                   d2_device_buf.GetDeviceBuffer()},
+                               e_device_buf.GetDeviceBuffer(),
+                               tokens,
+                               SORTED_SIZE,
+                               N,
+                               K,
+                               StrideA,
+                               StrideB,
+                               StrideDs,
+                               StrideE,
+                               KBatch,
+                               a_element_op,
+                               b_element_op,
+                               cde_element_op);
+    if(!device_op.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_gemm with the specified compilation parameters does "
+            "not support this GEMM problem");
+    }
+    if (time_kernel) {
+        // not result correct here because output buf not setzero
+        float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+        std::size_t flop = std::size_t(2) * SORTED_SIZE * N * K;
+        std::size_t num_btype =
+            sizeof(A0DataType) * SORTED_SIZE * K + sizeof(B0DataType) * K * N * experts + sizeof(EDataType) * SORTED_SIZE * N;
+        float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+        float gb_per_sec = num_btype / 1.E6 / ave_time;
+        std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
+                  << " GB/s" << device_op.GetTypeString() << std::endl;
+    }
+    if(do_verification)
+    {
+        //gemm2 use atomic, so need to reinit outputs
+        e_device_buf.ToDevice(e_t_n_device_result.mData.data());
+        invoker.Run(argument, StreamConfig{nullptr, false, 0 ,0,1});
+        Tensor<CShuffleDataType> c_t_n({tokens, N});
+        using ReferenceGemmInstance =
+            ck::tensor_operation::host::ReferenceMoeGemm2<A0DataType,
+                                                          B0DataType,
+                                                          D0DataType,
+                                                          D1DataType,
+                                                          D2DataType,
+                                                          CShuffleDataType,
+                                                          AccDataType,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          CDEElementOp>;
+        auto ref_moe_gemm           = ReferenceGemmInstance{};
+        auto ref_invoker            = ref_moe_gemm.MakeInvoker();
+        auto ref_argument = ref_moe_gemm.MakeArgument(
+           sorted_token_ids, expert_ids, sorted_tile_size, a0_m_k, b0_e_n_k, d0_m_n, d1_e_n, d2_e_n, c_t_n, PassThrough{}, PassThrough{}, cde_element_op);
+        ref_invoker.Run(ref_argument);
+        for(int t = 0; t < tokens; ++t)
+        {
+            for(int n = 0; n < N; ++n)
+            {
+                e_t_n_host_result(t, n) = ck::type_convert<EDataType>(c_t_n(t, n));
+            }
+        }
+        e_device_buf.FromDevice(e_t_n_device_result.mData.data());
+        e_t_n_device_result.savetxt("out.txt");
+        e_t_n_host_result.savetxt("ref.txt");
+        return ck::utils::check_err(
+                   e_t_n_device_result, e_t_n_host_result, "Error: Incorrect results!", 1e-3, 5e-2)
+                   ? 0
+                   : 1;
+    }
+    return 0;
+}
--- a/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm_scatter.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm_scatter.hpp
@@ -194,6 +194,20 @@ struct GridwiseMoeGemmScatter
    using ThisThreadBlock = ThisThreadBlock<BlockSize>;
+    static constexpr index_t APackedSize = []() {
+        if constexpr(is_same_v<remove_cvref_t<ADataType>, pk_i4_t>)
+            return 2;
+        else
+            return 1;
+    }();
+    static constexpr index_t BPackedSize = []() {
+        if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
+            return 2;
+        else
+            return 1;
+    }();
    __host__ static auto CalculateGridSize(index_t M, index_t N)
    {
        return std::make_tuple(math::integer_divide_ceil(N, NPerBlock),
@@ -381,6 +395,10 @@ struct GridwiseMoeGemmScatter
        using GemmSpecialization = tensor_operation::device::GemmSpecialization;
+        static_assert(!(is_same_v<remove_cvref_t<ADataType>, pk_i4_t> &&
+                        GemmSpec != GemmSpecialization::Default),
+                      "pk_i4_t does not support padding");
        if constexpr(GemmSpec == GemmSpecialization::NKPadding ||
                     GemmSpec == GemmSpecialization::MNKPadding)
        {
@@ -670,7 +688,7 @@ struct GridwiseMoeGemmScatter
        {
            if constexpr(is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
            {
-                a_k_split_offset = k_id * karg.KRead;
+                a_k_split_offset = k_id * karg.KRead / APackedSize;
            }
            else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
            {
@@ -684,7 +702,7 @@ struct GridwiseMoeGemmScatter
            else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, BLayout>)
            {
                // KPack * NLane * KLane * K0 * N0
-                b_k_split_offset = k_id * karg.KRead * NLane;
+                b_k_split_offset = k_id * karg.KRead * NLane / BPackedSize;
            }
            if(k_id < karg.KBatch - 1)
@@ -714,7 +732,7 @@ struct GridwiseMoeGemmScatter
        // in some cases.
        else if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
        {
-            constexpr auto MLdsLayer        = 32 * 4 / KPerBlock / sizeof(LDSTypeA) < 1
+            constexpr auto MLdsLayer        = 32 * 4 / KPerBlock / sizeof(LDSTypeA) / APackedSize < 1
                                                  ? 1
                                                  : 32 * 4 / KPerBlock / sizeof(LDSTypeA);
            constexpr auto a_lds_block_desc = make_naive_tensor_descriptor(
@@ -864,8 +882,8 @@ struct GridwiseMoeGemmScatter
                                BlkGemmPipelineVer,
                                BlkGemmPipeSched,
                                BlockSize,
-                                LDSTypeA,
+                                ADataType,
-                                LDSTypeB,
+                                BDataType,
                                ComputeTypeA,
                                AccDataType,
                                decltype(GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()),
@@ -1158,7 +1176,7 @@ struct GridwiseMoeGemmScatter
        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize());
        const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_b_grid + expert_id * expert_stride, b_grid_desc_bpreshuffled.GetElementSpaceSize());
+            p_b_grid + expert_id * expert_stride / BPackedSize, b_grid_desc_bpreshuffled.GetElementSpaceSize());
        // if(threadIdx.x==0)
        // printf("tid %d eid %d expert_stride %d bufsize %d\n",
        // threadIdx.x, expert_id, expert_stride, a_grid_desc_ak0_m_ak1.GetElementSpaceSize());
@@ -1211,7 +1229,8 @@ struct GridwiseMoeGemmScatter
            decltype(b_grid_desc_bpreshuffled),
            decltype(b_block_desc_bk0_n_bk1),
            Sequence<Number<NXdlPerWave>{}, I1, Number<KRepeat>{}, Number<BK1Value>{}>,
-            Sequence<0, 1, 2, 3>,
+            // Sequence<0, 1, 2, 3>,
+            Sequence<1, 2, 0, 3>,
            3,
            BBlockTransferSrcScalarPerVector,
            BThreadTransferSrcResetCoordinateAfterRun,

--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm.hpp
@@ -110,7 +110,7 @@ struct ReferenceMoeGemm : public device::BaseOperator
                        }
                        else
                        {
-                            arg.b_element_op_(v_b, arg.b_e_n_k_(e, n, k));
+                            arg.b_element_op_(v_b, arg.b_e_n_k_(e, k, n));
                        }
                        v_acc +=

--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm2.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm2.hpp
@@ -98,21 +98,29 @@ struct ReferenceMoeGemm2 : public device::BaseOperator
                if(t < token_cnt) {
                    for(int k = 0; k < K; ++k)
                    {
-                        // use PassThrough instead of ConvertBF16RTN for reference calculation
+                        if constexpr(is_same_v<ADataType, pk_i4_t>)
-                        if constexpr(is_same_v<AElementwiseOperation,
-                                            ck::tensor_operation::element_wise::ConvertBF16RTN>)
                        {
-                            ck::tensor_operation::element_wise::PassThrough{}(v_a, arg.a_t_k_k_(t, topk_id, k));
+                            uint8_t i4x2 = arg.a_t_k_(m, k).data;
+                            uint8_t i4   = 0;
+                            if(k % 2 == 1)
+                                i4 = (i4x2 >> 0) & 0xf;
+                            else
+                                i4 = (i4x2 >> 4) & 0xf;
+                            v_a = i4_to_f32_gfx9(i4);
                        }
                        else
                        {
                            arg.a_element_op_(v_a, arg.a_t_k_k_(t, topk_id, k));
                        }
-                        // same for B matrix
+                        if constexpr(is_same_v<BDataType, pk_i4_t>)
-                        if constexpr(is_same_v<BElementwiseOperation,
-                                            ck::tensor_operation::element_wise::ConvertBF16RTN>)
                        {
-                            ck::tensor_operation::element_wise::PassThrough{}(v_b, arg.b_e_n_k_(e, n, k));
+                            uint8_t i4x2 = arg.b_e_n_k_(e, k, n).data;
+                            uint8_t i4   = 0;
+                            if(k % 2 == 1)
+                                i4 = (i4x2 >> 0) & 0xf;
+                            else
+                                i4 = (i4x2 >> 4) & 0xf;
+                            v_b = i4_to_f32_gfx9(i4);
                        }
                        else
                        {
@@ -189,6 +197,28 @@ struct ReferenceMoeGemm2 : public device::BaseOperator
        return str.str();
    }
+    static float i4_to_f32_gfx9(uint8_t i4)
+    {
+        static std::unordered_map<uint8_t, float> u = {{0b1000, -0.5000f},
+                                                       {0b1001, -0.4375f},
+                                                       {0b1010, -0.3750f},
+                                                       {0b1011, -0.3125f},
+                                                       {0b1100, -0.2500f},
+                                                       {0b1101, -0.1875f},
+                                                       {0b1110, -0.1250f},
+                                                       {0b1111, -0.0625f},
+                                                       {0b0, +0.0000f},
+                                                       {0b1, +0.0625f},
+                                                       {0b10, +0.1250f},
+                                                       {0b11, +0.1875f},
+                                                       {0b100, +0.2500f},
+                                                       {0b101, +0.3125f},
+                                                       {0b110, +0.3750f},
+                                                       {0b111, +0.4375f}};
+        return u[i4];
+    }
 };
 } // namespace host