add optimization for small gemm

d2640676 · Wenkai · 840a617d · d2640676 · d2640676 · d2640676
Commit d2640676 authored Jul 10, 2022 by Wenkai
4 changed files
--- a/example/01_gemm/gemm_xdl_fp16_splitk.cpp
+++ b/example/01_gemm/gemm_xdl_fp16_splitk.cpp
@@ -11,7 +11,7 @@
 #include "host_tensor_generator.hpp"
 #include "device_tensor.hpp"
 #include "device_gemm_xdl.hpp"
-#include "device_gemm_xdl_splitk_c_shuffle.hpp"
+#include "device_gemm_xdl_splitk_c_shuffle_small_gemm.hpp"
 #include "device_gemm_xdl_splitk_c_shuffle_static.hpp"
 #include "device_gemm_xdl_cshuffle.hpp"
 #include "element_wise_operation.hpp"
@@ -47,25 +47,34 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa
 #if USEING_STATIC_KERNEL
+#if MNKB_1_4
 using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdlSplitKCShuffleStatic
 //######| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
 //######|        |        |        |  Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
 //######|        |        |        |      |      |      |        |         |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 //######|        |        |        |      |      |      |        |         |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-            //<Row,      Row,     Row,   F16,   F16,   F16,     F32,      F16,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        3,   256,    16,   128,    32,   8,   2,   16,   16,    1,    2,  S<1, 4, 16, 4>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              2,              2,         1,  S<1, 8, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              2,         8,           1,           2,              S<1, 4, 1, 64>,               2>;
+            <Row,      Row,     Row,   F16,   F16,   F16,     F32,      F16,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,    16,   128,    32,   8,   2,   16,   16,    1,    2,  S<1, 4, 16, 4>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              2,              2,         1,  S<1, 8, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              2,         8,           1,           2,              S<1, 4, 1, 64>,               2>;
-            <Row,      Row,     Row,   F16,   F16,   F16,     F32,      F16,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        2,   256,    16,   128,    32,   8,   2,   16,   16,    1,    2,  S<1, 4, 16, 4>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              2,              2,         1,  S<1, 8, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              2,         8,           1,           2,              S<1, 4, 1, 64>,               2>;
+#else
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdlSplitKCShuffleStatic
+//######| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+//######|        |        |        |  Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+//######|        |        |        |      |      |      |        |         |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+//######|        |        |        |      |      |      |        |         |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+            <Row,      Row,     Row,   F16,   F16,   F16,     F32,      F16,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        3,   256,    16,   128,    32,   8,   2,   16,   16,    1,    2,  S<1, 4, 16, 4>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              2,              2,         1,  S<1, 8, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              2,         8,           1,           2,              S<1, 4, 1, 64>,               2>;
+            //<Row,      Row,     Row,   F16,   F16,   F16,     F32,      F16,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        2,   256,    16,   256,    32,   8,   2,   16,   16,    1,    4,  S<1, 4, 16, 4>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              2,              2,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              2,         8,           1,           2,              S<1, 4, 1, 64>,               2>;
            //<Row,      Col,     Row,   F16,   F16,   F16,     F32,      F16,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        1,   256,    16,   128,    128,   8,   8,   16,   16,    1,    2,  S<1, 16, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 16, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           2,              S<1, 4, 1, 64>,               2>;
            //<Row,      Row,     Row,   F16,   F16,   F16,     F32,      F16,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        4,   256,    16,   128,    32,   8,   2,   16,   16,    1,    2,  S<1, 4, 16, 4>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              2,              2,         1,  S<1, 8, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              2,         8,           1,           2,              S<1, 4, 1, 64>,               2>;
+#endif
 #else
 // clang-format off
-using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdlSplitKCShuffle
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdlSplitKCShuffleSmallGemm
 //######| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
 //######|        |        |        |  Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
 //######|        |        |        |      |      |      |        |         |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 //######|        |        |        |      |      |      |        |         |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
            <Row,      Row,     Row,   F16,   F16,   F16,     F32,      F16,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        3,   256,    16,   128,    32,   8,   2,   16,   16,    1,    2,  S<1, 4, 16, 4>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              2,              2,         1,  S<1, 8, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              2,         8,           1,           1,              S<1, 16, 1, 16>,               2>;
-            //<Row,      Row,     Row,   F16,   F16,   F16,     F32,      F16,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        4,   256,    16,   128,    32,   8,   2,   16,   16,    1,    2,  S<1, 4, 16, 4>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              2,              2,         1,  S<1, 8, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              2,         8,           1,           1,              S<1, 16, 1, 16>,               2>;
+            //<Row,      Row,     Row,   F16,   F16,   F16,     F32,      F16,  AElementOp,  BElementOp,  CElementOp,    GemmDefault,        3,   256,    16,   256,    32,   8,   2,   16,   16,    1,    4,  S<1, 4, 16, 4>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              2,              2,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              2,         8,           1,           2,              S<1, 4, 1, 64>,               2>;
 // clang-format on

--- a/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle_small_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle_small_gemm.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp
@@ -14,12 +14,12 @@ struct GridwiseGemmPipeline_v2<1>
    __host__ __device__ static constexpr bool IsSupported(index_t num_loop)
    {
        // TODO: improve applicability
-        return num_loop > 2;
+        return num_loop >= 2;
    }
    __host__ __device__ static constexpr bool CalculateHasMainLoop(index_t num_loop)
    {
-        return (num_loop / 2) > 1;
+        return num_loop > 2;
    }
    template <bool HasMainLoop,

--- a/include/ck/utility/common_header.hpp
+++ b/include/ck/utility/common_header.hpp
@@ -48,31 +48,44 @@
 #include "amd_xdlops.hpp"
 #endif
-#define USEING_STATIC_KERNEL 1
+#define USEING_STATIC_KERNEL 0
-#define MNKB_16_1152_5120_8 0
+#define MNKB_0_8 1
-#define MNKB_16_5120_384_3 1
+#define MNKB_1_4 0
-#define MNKB_16_1280_5120_8 1
+#define MNKB_2_8 0
-#define MNKB_16_5120_1280_5 1
+#define MNKB_3_5 0
-#if MNKB_16_1152_5120_8
+#define MNKB_4_5 0
+#define MNKB_5_5 0
+#if MNKB_0_8
+#define M_matrix 16
+#define N_matrix 4096
+#define K_matrix 12800
+#define K_batch 5
+#elif MNKB_1_4
+#define M_matrix 16
+#define N_matrix 4096
+#define K_matrix 12800
+#define K_batch 5
+#elif MNKB_2_8
 #define M_matrix 16
-#define N_matrix 1152
+#define N_matrix 4096
-#define K_matrix 5120
+#define K_matrix 12800
-#define K_batch 8
+#define K_batch 5
-#elif MNKB_16_5120_384_3
+#elif MNKB_3_5
 #define M_matrix 16
-#define N_matrix 5120
+#define N_matrix 4096
-#define K_matrix 384
+#define K_matrix 12800
-#define K_batch 4
+#define K_batch 5
-#elif MNKB_16_1280_5120_8
+#elif MNKB_4_5
 #define M_matrix 16
-#define N_matrix 1280
+#define N_matrix 4096
-#define K_matrix 5120
+#define K_matrix 12800
-#define K_batch 8
+#define K_batch 5
-#elif MNKB_16_5120_1280_5
+#elif MNKB_5_5
 #define M_matrix 16
-#define N_matrix 5120
+#define N_matrix 4096
-#define K_matrix 1280
+#define K_matrix 12800
 #define K_batch 5
 #endif