Update tuning parameter & compilation options of DeviceGemmXdl<> instance (layout=TT) (#819)

* Enable pipeline v2 opt for layout=TT instance * Use better thread mapping for reading A tile * Conditionally enable pipeline v2 opt * Allow enabling only fp16 gemm instances in profiler * Fix formatting error * Fix compilation error if we enable fp32 in profiler

Update tuning parameter & compilation options of DeviceGemmXdl<> instance (layout=TT) (#819)
* Enable pipeline v2 opt for layout=TT instance * Use better thread mapping for reading A tile * Conditionally enable pipeline v2 opt * Allow enabling only fp16 gemm instances in profiler * Fix formatting error * Fix compilation error if we enable fp32 in profiler
f7cc8c3b · Po Yen Chen · GitHub · 7761e523 · f7cc8c3b · f7cc8c3b
Unverified Commit f7cc8c3b authored Aug 02, 2023 by Po Yen Chen Committed by GitHub Aug 02, 2023
3 changed files
--- a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt
@@ -95,36 +95,39 @@ endif()

 add_instance_library(device_gemm_instance ${GEMM_INSTANCES})

-set(ENABLE_PIPELINE_V2_OPT OFF)

-if (ENABLE_PIPELINE_V2_OPT)
-set(MAX_ILP_OPTS
-   -mllvm
-   -amdgpu-enable-max-ilp-scheduling-strategy
-)
-set(WAVES_PER_EU_DEFS
-   CK_USE_WAVES_PER_EU=1
-   CK_MIN_WAVES_PER_EU=1
-   CK_MAX_WAVES_PER_EU=1
-)
-set(IGLP_OPT_DEFS
-   CK_EXPERIMENTAL_PIPELINE_V2_IGLP_OPT=1
-)
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+  set(ENABLE_PIPELINE_V2_OPT OFF)
+
+  if (ENABLE_PIPELINE_V2_OPT)
+    set(MAX_ILP_OPTS
+       -mllvm
+       -amdgpu-enable-max-ilp-scheduling-strategy
+    )
+    set(WAVES_PER_EU_DEFS
+       CK_USE_WAVES_PER_EU=1
+       CK_MIN_WAVES_PER_EU=1
+       CK_MAX_WAVES_PER_EU=1
+    )
+    set(IGLP_OPT_DEFS
+       CK_EXPERIMENTAL_PIPELINE_V2_IGLP_OPT=1
+    )

-# layout=NT
-set_source_files_properties(device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES
-   COMPILE_OPTIONS ";;"
-   COMPILE_DEFINITIONS "${WAVES_PER_EU_DEFS};${IGLP_OPT_DEFS}")
-# layout=NN
-set_source_files_properties(device_gemm_xdl_f16_f16_f16/km_nk_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES
-   COMPILE_OPTIONS "${MAX_ILP_OPTS}"
-   COMPILE_DEFINITIONS "${WAVES_PER_EU_DEFS};${IGLP_OPT_DEFS}")
-# layout=TT
-set_source_files_properties(device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES
-   COMPILE_OPTIONS "${MAX_ILP_OPTS}"
-   COMPILE_DEFINITIONS "${WAVES_PER_EU_DEFS}")
-# layout=TN
-set_source_files_properties(device_gemm_xdl_f16_f16_f16/mk_nk_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES
-   COMPILE_OPTIONS "${MAX_ILP_OPTS}"
-   COMPILE_DEFINITIONS "${WAVES_PER_EU_DEFS};${IGLP_OPT_DEFS}")
-endif(ENABLE_PIPELINE_V2_OPT)
+    # layout=NT
+    set_source_files_properties(device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES
+       COMPILE_OPTIONS ";;"
+       COMPILE_DEFINITIONS "${WAVES_PER_EU_DEFS};${IGLP_OPT_DEFS}")
+    # layout=NN
+    set_source_files_properties(device_gemm_xdl_f16_f16_f16/km_nk_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES
+       COMPILE_OPTIONS "${MAX_ILP_OPTS}"
+       COMPILE_DEFINITIONS "${WAVES_PER_EU_DEFS};${IGLP_OPT_DEFS}")
+    # layout=TT
+    set_source_files_properties(device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES
+       COMPILE_OPTIONS ";;"
+       COMPILE_DEFINITIONS "${WAVES_PER_EU_DEFS};${IGLP_OPT_DEFS}")
+    # layout=TN
+    set_source_files_properties(device_gemm_xdl_f16_f16_f16/mk_nk_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES
+       COMPILE_OPTIONS "${MAX_ILP_OPTS}"
+       COMPILE_DEFINITIONS "${WAVES_PER_EU_DEFS};${IGLP_OPT_DEFS}")
+  endif(ENABLE_PIPELINE_V2_OPT)
+endif(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v2_opt_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v2_opt_instance.cpp
@@ -18,7 +18,7 @@ using Instances =
        //##########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar|            |                       |                      |
        //##########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|              |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|            |                       |                      |
        //##########|      |      |      |        |        |        |        |            |            |            |              |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |            |                       |                      |
-        DeviceGemmXdl<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,    64,   128,     8,  8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1,           1,  LoopScheduler::Default,  PipelineVersion::v2>
+        DeviceGemmXdl<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,    64,   128,     8,  8,   32,   32,    1,    2,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1,           1,  LoopScheduler::Default,  PipelineVersion::v2>
 #endif
        // clang-format on
        >;

--- a/profiler/src/profile_gemm.cpp
+++ b/profiler/src/profile_gemm.cpp
@@ -121,7 +121,10 @@ int profile_gemm(int argc, char* argv[])
        return pass ? 0 : 1;
    };

-    if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN)
+    if(false)
+        ;
+#ifdef __fp32__
+    else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN)
    {
        return profile(Row{}, Row{}, Row{}, F32{}, F32{}, F32{}, F32{});
    }
@@ -137,6 +140,8 @@ int profile_gemm(int argc, char* argv[])
    {
        return profile(Col{}, Col{}, Row{}, F32{}, F32{}, F32{}, F32{});
    }
+#endif
+#ifdef __fp16__
    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
    {
        return profile(Row{}, Row{}, Row{}, F16{}, F16{}, F32{}, F16{});
@@ -153,6 +158,7 @@ int profile_gemm(int argc, char* argv[])
    {
        return profile(Col{}, Col{}, Row{}, F16{}, F16{}, F32{}, F16{});
    }
+#endif
 #ifdef __bf16__
    else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_KN_MN)
    {