2x2 ok

4404984a · coderfeli · f64b1375 · 4404984a · 4404984a · 4404984a
Commit 4404984a authored Feb 17, 2025 by coderfeli
3 changed files
--- a/example/65_gemm_multiply_multiply/moe_gemm1.cpp
+++ b/example/65_gemm_multiply_multiply/moe_gemm1.cpp
@@ -164,7 +164,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceMoeGemm
               // mn_perxdl
               MNPerXDL,   MNPerXDL,
               // mn_xdlperwave 
-               MXDLPerWave,    1,
+               2,    2,
               // a,b: loadtranfer cluster, cluster order, srcorder,VECDIM, srcpervec, dstpervec, lds_extra
            //    S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0,
            //    S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 0,
@@ -173,7 +173,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceMoeGemm
               //    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
               //    MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
                //  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-               CShuffleMXDLPerWave,    1,   S<1, 32, 1, 8>, S<EVec, D0Vec, D1Vec>,
+               2,    2,   S<1, 32, 1, 8>, S<EVec, D0Vec, D1Vec>,
               ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, true, A0DataType>;
        // kernel 2: 128->32x128x128
        //  <      Row,      Col, DsLayout, ELayout, A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp, CDEElementOp,       GemmSpec,   128,   32,   128,    128,  16,  16,  32,   32,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,     S<8, 16, 1>,    S<1, 0, 2>,     S<1, 0, 2>,             2,              16,             16,          0,          1,           1,               S<1, 16, 1, 8>,      S<8, 8, 1>,  ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v1, EDataType>;

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm_gather.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm_gather.hpp
@@ -175,7 +175,6 @@ struct GridwiseMoeGemmGather
    static constexpr index_t KRepeat = KPerBlock / KLane / KPack;
    static constexpr index_t NLane   = NPerXdl;
    static constexpr index_t NWave   = NPerBlock / NPerXdl / NXdlPerWave;
-    static_assert(NWave * warpSize == BlockSize);
    // static constexpr index_t NumTokens = 1;
    static constexpr index_t SortedTileSize   = MPerBlock;
@@ -1247,13 +1246,13 @@ struct GridwiseMoeGemmGather
            decltype(b_grid_desc_bpreshuffled),
            decltype(b_block_desc_bk0_n_bk1),
            Sequence<Number<NXdlPerWave>{}, I1, Number<KRepeat>{}, Number<BK1Value>{}>,
-            Sequence<0, 1, 2, 3>,
+            Sequence<1, 2, 0, 3>,
            3,
            BBlockTransferSrcScalarPerVector,
            BThreadTransferSrcResetCoordinateAfterRun,
            true>(b_grid_desc_bpreshuffled,
                  make_multi_index(n_block_data_idx_on_grid,
-                                   get_warp_local_1d_id(),
+                    get_warp_local_1d_id() % NWave,
                                   0,
                                   KPack * (get_thread_local_1d_id() % warpSize)));