Add new instance (prepare to apply optimization)

ff7d21b1 · Po-Yen, Chen · 5a98e0e0 · ff7d21b1 · ff7d21b1 · ff7d21b1
Commit ff7d21b1 authored Jul 20, 2023 by Po-Yen, Chen
3 changed files
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16/CMakeLists.txt
@@ -15,6 +15,7 @@ set(INSTANCE_FILES
   ${CURRENT_DIR}/mk_kn_mn_add_instance.cpp
   ${CURRENT_DIR}/mk_nk_mn_1_stage_default_pipeline_v1_instance.cpp
   ${CURRENT_DIR}/mk_nk_mn_1_stage_default_pipeline_v2_instance.cpp
+   ${CURRENT_DIR}/mk_nk_mn_1_stage_default_pipeline_v2_opt_instance.cpp
   ${CURRENT_DIR}/mk_nk_mn_1_stage_interwave_pipeline_v1_instance.cpp
   ${CURRENT_DIR}/mk_nk_mn_2_stage_default_pipeline_v1_instance.cpp
   ${CURRENT_DIR}/mk_nk_mn_2_stage_default_pipeline_v2_instance.cpp

--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16/mk_nk_mn_1_stage_default_pipeline_v2_opt_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16/mk_nk_mn_1_stage_default_pipeline_v2_opt_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for a[m, k] * b[n, k] = c[m, n]; Row/Col/Row layouts, F16 data, F32 accumulator.
+using Instances = std::tuple<
+// clang-format off
+        // pipeline v2, 1 wave
+        //#####################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
+        //#####################|        |        |        |  Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
+        //#####################|        |        |        |      |      |      |        |         |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+        //#####################|        |        |        |      |      |      |        |         |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES // when zero or undefined, Instances is the empty std::tuple<>
+        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,    64,   128,    64,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,  LoopScheduler::Default,        PipelineVersion::v2>
+#endif // CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES
+    // clang-format on
+    >;
+
+void add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_1_stage_default_pipeline_v2_opt_instances(
+    OwnerList<InstanceTN>& instances)
+{ // Instances is std::tuple<> (empty) unless CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES is non-zero.
+    add_device_operation_instances(instances, Instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16/mk_nk_mn_add_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16/mk_nk_mn_add_instance.cpp
@@ -14,6 +14,8 @@ void add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_1_stage_default_pipeline
    Instances&);
 void add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_1_stage_default_pipeline_v2_instances(
    Instances&);
+void add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_1_stage_default_pipeline_v2_opt_instances(
+    Instances&);
 void add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_1_stage_interwave_pipeline_v1_instances(
    Instances&);

@@ -30,6 +32,8 @@ void add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances(Instances& ins
        instances);
    add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_1_stage_default_pipeline_v2_instances(
        instances);
+    add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_1_stage_default_pipeline_v2_opt_instances(
+        instances);
    add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_1_stage_interwave_pipeline_v1_instances(
        instances);