fix mtile 64,128 for gemm1

568ad1e1 · coderfeli · 59f3e009 · 568ad1e1 · 568ad1e1
Commit 568ad1e1 authored Feb 12, 2025 by coderfeli
Showing with 5 additions and 4 deletions

example/65_gemm_multiply_multiply/moe_gemm1.cpp example/65_gemm_multiply_multiply/moe_gemm1.cpp +4 -3

example/65_gemm_multiply_multiply/moe_gemm2.cpp example/65_gemm_multiply_multiply/moe_gemm2.cpp +1 -1

No files found.
--- a/example/65_gemm_multiply_multiply/moe_gemm1.cpp
+++ b/example/65_gemm_multiply_multiply/moe_gemm1.cpp
@@ -132,8 +132,9 @@ using AElementOp   = PassThrough;
 using BElementOp   = PassThrough;

 static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default;
-static constexpr ck::index_t MPerBlock = 32;
+static constexpr ck::index_t MPerBlock = 128;
 static constexpr ck::index_t MNPerXDL = 32;
+static constexpr ck::index_t CShuffleMXDLPerWave = MPerBlock / 32;
 static constexpr ck::index_t KPerBlock = 256 / sizeof(A0DataType);
 static constexpr ck::index_t MXDLPerWave = MPerBlock / 32; //todo fix this constraint
 static constexpr ck::index_t AK1 = 16 / sizeof(A0DataType);
@@ -170,7 +171,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceMoeGemm
               //    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
               //    MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
                //  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-               1,    1,   S<1, 32, 1, 8>, S<EVec, D0Vec, D1Vec>,
+               CShuffleMXDLPerWave,    1,   S<1, 32, 1, 8>, S<EVec, D0Vec, D1Vec>,
               ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, true, A0DataType>;
        // kernel 2: 128->32x128x128
        //  <      Row,      Col, DsLayout, ELayout, A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp, CDEElementOp,       GemmSpec,   128,   32,   128,    128,  16,  16,  32,   32,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,     S<8, 16, 1>,    S<1, 0, 2>,     S<1, 0, 2>,             2,              16,             16,          0,          1,           1,               S<1, 16, 1, 8>,      S<8, 8, 1>,  ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v1, EDataType>;
@@ -194,7 +195,7 @@ int main(int argc, char* argv[])
    ck::index_t sorted_tile_num = 8;
    ck::index_t sorted_tile_size = MPerBlock;
    ck::index_t SORTED_SIZE = sorted_tile_num * sorted_tile_size;
-    ck::index_t tokens = 32;
+    ck::index_t tokens = 128;

    if(argc == 1)
    {

--- a/example/65_gemm_multiply_multiply/moe_gemm2.cpp
+++ b/example/65_gemm_multiply_multiply/moe_gemm2.cpp
@@ -121,7 +121,7 @@ using BElementOp   = PassThrough;
 using CDEElementOp = MulABScaleExpertWeight;

 static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default;
-static constexpr ck::index_t MPerBlock = 32;
+static constexpr ck::index_t MPerBlock = 64;
 static constexpr ck::index_t MNPerXDL = 32;
 static constexpr ck::index_t KPerBlock = 256 / sizeof(A0DataType);
 static constexpr ck::index_t MXDLPerWave = MPerBlock / 32; //todo fix this constraint