add tuning parameters

cca0ceee · ltqin · 1b9e6e11 · cca0ceee · cca0ceee · cca0ceee
Commit cca0ceee authored Dec 30, 2021 by ltqin
6 changed files
--- a/device_operation/device_gemm_xdl_splitk_instance_f32_f32_f32_km_kn_mn.cpp
+++ b/device_operation/device_gemm_xdl_splitk_instance_f32_f32_f32_km_kn_mn.cpp
@@ -22,7 +22,7 @@ using S = ck::Sequence<Is...>;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 // Compilation parameters for a[k, m] * b[k, n] = c[m, n]
-/*using device_gemm_xdl_instance_f32_f32_f32_km_kn_mn = std::tuple<
+using device_gemm_xdl_instance_f32_f32_f32_km_kn_mn = std::tuple<
    // clang-format off
        //##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer|  BBlockTransfer|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| ABlockLds| BBlockLds|
        //##########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|     ThreadSlice|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|     ThreadSlice|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| SrcDstVectorDim|       DstScalar| AddExtraM| AddExtraN|
@@ -38,12 +38,12 @@ using PassThrough = ck::tensor_operation::element_wise::PassThrough;
        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Col,     Row,     Row,  PassThrough, PassThrough, PassThrough,   256,    64,   128,     4,  4,   32,   32,    1,    2,      S<1, 1, 1, 4>,     S<1, 4, 64, 1>,     S<0, 1, 3, 2>,     S<0, 1, 3, 2>,              2,              1,              4,      S<1, 1, 2, 4>,     S<1, 4, 64, 1>,     S<0, 1, 3, 2>,     S<0, 1, 3, 2>,             2,              2,              4,               7,               1,      true,      true, 720>
    // clang-format on
    >;
-*/
 template <>
 void add_device_splitk_gemm_instance<F32, F32, F32, Col, Row, Row>(
-    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& )
+    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& device_op_instances)
 {
- /*   using DeviceGemms = device_gemm_instance::device_gemm_xdl_instance_f32_f32_f32_km_kn_mn;
+    using DeviceGemms = device_gemm_instance::device_gemm_xdl_instance_f32_f32_f32_km_kn_mn;
    const auto device_gemms = DeviceGemms{};
@@ -53,7 +53,7 @@ void add_device_splitk_gemm_instance<F32, F32, F32, Col, Row, Row>(
        auto gemm = Gemm{};
        device_op_instances.push_back(std::make_unique<Gemm>(gemm));
-    });*/
+    });
 }
 } // namespace device_gemm_instance

--- a/device_operation/device_gemm_xdl_splitk_instance_f32_f32_f32_km_nk_mn.cpp
+++ b/device_operation/device_gemm_xdl_splitk_instance_f32_f32_f32_km_nk_mn.cpp
@@ -22,7 +22,7 @@ using S = ck::Sequence<Is...>;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 // Compilation parameters for a[k, m] * b[n, k] = c[m, n]
-/*using device_gemm_xdl_instance_f32_f32_f32_km_nk_mn = std::tuple<
+using device_gemm_xdl_instance_f32_f32_f32_km_nk_mn = std::tuple<
    // clang-format off
        //##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer|  BBlockTransfer|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| ABlockLds| BBlockLds|
        //##########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|     ThreadSlice|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|     ThreadSlice|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| SrcDstVectorDim|       DstScalar| AddExtraM| AddExtraN|
@@ -38,12 +38,12 @@ using PassThrough = ck::tensor_operation::element_wise::PassThrough;
        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,   256,    64,   128,     4,  4,   32,   32,    1,    2,      S<1, 1, 1, 4>,     S<1, 4, 64, 1>,     S<0, 1, 3, 2> ,     S<0, 1, 3, 2> ,              2,              1,              4,      S<1, 1, 2, 4>,     S<1, 4, 64, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,             3,              4,              4,               7,               1,      true,      true, 720>
    // clang-format on
    >;
-*/
 template <>
 void add_device_splitk_gemm_instance<F32, F32, F32, Col, Col, Row>(
-    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& )
+    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& device_op_instances)
 {
-  /*  using DeviceGemms = device_gemm_instance::device_gemm_xdl_instance_f32_f32_f32_km_nk_mn;
+    using DeviceGemms = device_gemm_instance::device_gemm_xdl_instance_f32_f32_f32_km_nk_mn;
    const auto device_gemms = DeviceGemms{};
@@ -53,7 +53,7 @@ void add_device_splitk_gemm_instance<F32, F32, F32, Col, Col, Row>(
        auto gemm = Gemm{};
        device_op_instances.push_back(std::make_unique<Gemm>(gemm));
-    });*/
+    });
 }
 } // namespace device_gemm_instance

--- a/device_operation/device_gemm_xdl_splitk_instance_f32_f32_f32_mk_kn_mn.cpp
+++ b/device_operation/device_gemm_xdl_splitk_instance_f32_f32_f32_mk_kn_mn.cpp
@@ -28,15 +28,15 @@ using device_gemm_xdl_instance_f32_f32_f32_mk_kn_mn = std::tuple<
        //##########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|     ThreadSlice|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|     ThreadSlice|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| SrcDstVectorDim|       DstScalar| AddExtraM| AddExtraN|
        //##########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_N_K1| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1| Lengths_K0_N_K1| Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|                |       PerVector|          |          |
        //##########|      |      |      |        |        |        |        |            |            |            |      |      |      |      |   |     |     |     |     |                |                |               |               |               |               |               |                |                |               |               |              |               |               |                |                |          |          |
-        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,   256,   96,   128,     4,  8,   16,   16,    3,    4,      S<1, 1, 3, 4>,     S<1, 4, 32, 2>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,              3,              4,              4,      S<1, 1, 2, 8>,     S<1, 4, 64, 1>,     S<0, 1, 3, 2>,     S<0, 1, 3, 2>,             2,              2,              4,               7,               1,      true,      true, 720>
+        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,   256,   96,   128,     4,  8,   16,   16,    3,    4,      S<1, 1, 3, 4>,     S<1, 4, 32, 2>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,              3,              4,              4,      S<1, 1, 2, 8>,     S<1, 4, 64, 1>,     S<0, 1, 3, 2>,     S<0, 1, 3, 2>,             2,              2,              4,               7,               1,      true,      true, 720>,
-     /*   DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,   256,   256,   128,     4,  4,   32,   32,    4,    2,      S<1, 1, 4, 4>,     S<1, 4, 64, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,              3,              4,              4,      S<1, 1, 2, 4>,     S<1, 4, 64, 1>,     S<0, 1, 3, 2>,     S<0, 1, 3, 2>,             2,              2,              4,               7,               1,      true,      true, 720>,
+        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,   256,   256,   128,     4,  4,   32,   32,    4,    2,      S<1, 1, 4, 4>,     S<1, 4, 64, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,              3,              4,              4,      S<1, 1, 2, 4>,     S<1, 4, 64, 1>,     S<0, 1, 3, 2>,     S<0, 1, 3, 2>,             2,              2,              4,               7,               1,      true,      true, 720>,
        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,   256,   128,   256,     4,  4,   32,   32,    2,    4,      S<1, 1, 2, 4>,     S<1, 4, 64, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,              3,              4,              4,      S<1, 1, 4, 4>,     S<1, 4, 64, 1>,     S<0, 1, 3, 2>,     S<0, 1, 3, 2>,             2,              4,              4,               7,               1,      true,      true, 720>,
        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,   128,   128,   128,     4,  4,   32,   32,    4,    2,      S<1, 1, 4, 4>,     S<1, 4, 32, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,              3,              4,              4,      S<1, 1, 4, 4>,     S<1, 4, 32, 1>,     S<0, 1, 3, 2>,     S<0, 1, 3, 2>,             2,              4,              4,               7,               1,      true,      true, 720>,
        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,   256,   128,   128,     4,  4,   32,   32,    2,    2,      S<1, 1, 2, 4>,     S<1, 4, 64, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,              3,              4,              4,      S<1, 1, 2, 4>,     S<1, 4, 64, 1>,     S<0, 1, 3, 2>,     S<0, 1, 3, 2>,             2,              2,              4,               7,               1,      true,      true, 720>,
        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,   128,   128,    64,     4,  4,   32,   32,    2,    2,      S<1, 1, 4, 4>,     S<1, 4, 32, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,              3,              4,              4,      S<1, 1, 2, 4>,     S<1, 4, 32, 1>,     S<0, 1, 3, 2>,     S<0, 1, 3, 2>,             2,              2,              4,               7,               1,      true,      true, 720>,
        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,   128,    64,   128,     4,  4,   32,   32,    2,    2,      S<1, 1, 2, 4>,     S<1, 4, 32, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,              3,              4,              4,      S<1, 1, 4, 4>,     S<1, 4, 32, 1>,     S<0, 1, 3, 2>,     S<0, 1, 3, 2>,             2,              4,              4,               7,               1,      true,      true, 720>,
        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,   256,   128,    64,     4,  4,   32,   32,    2,    1,      S<1, 1, 2, 4>,     S<1, 4, 64, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,              3,              4,              4,      S<1, 1, 1, 4>,     S<1, 4, 64, 1>,     S<0, 1, 3, 2>,     S<0, 1, 3, 2>,             2,              1,              4,               7,               1,      true,      true, 720>,
-        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,   256,    64,   128,     4,  4,   32,   32,    1,    2,      S<1, 1, 1, 4>,     S<1, 4, 64, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,              3,              4,              4,      S<1, 1, 2, 4>,     S<1, 4, 64, 1>,     S<0, 1, 3, 2>,     S<0, 1, 3, 2>,             2,              2,              4,               7,               1,      true,      true, 720>*/
+        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,   256,    64,   128,     4,  4,   32,   32,    1,    2,      S<1, 1, 1, 4>,     S<1, 4, 64, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,              3,              4,              4,      S<1, 1, 2, 4>,     S<1, 4, 64, 1>,     S<0, 1, 3, 2>,     S<0, 1, 3, 2>,             2,              2,              4,               7,               1,      true,      true, 720>
    >;
 template <>

--- a/device_operation/device_gemm_xdl_splitk_instance_f32_f32_f32_mk_nk_mn.cpp
+++ b/device_operation/device_gemm_xdl_splitk_instance_f32_f32_f32_mk_nk_mn.cpp
@@ -22,7 +22,7 @@ using S = ck::Sequence<Is...>;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 // Compilation parameters for a[m, k] * b[n, k] = c[m, n]
-/*using device_gemm_xdl_instance_f32_f32_f32_mk_nk_mn = std::tuple<
+using device_gemm_xdl_instance_f32_f32_f32_mk_nk_mn = std::tuple<
    // clang-format off
        //##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer|  BBlockTransfer|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| ABlockLds| BBlockLds|
        //##########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|     ThreadSlice|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|     ThreadSlice|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| SrcDstVectorDim|       DstScalar| AddExtraM| AddExtraN|
@@ -43,12 +43,12 @@ using PassThrough = ck::tensor_operation::element_wise::PassThrough;
        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Row,     Col,     Row,  PassThrough, PassThrough, PassThrough,    64,    32,    64,     4,  4,   32,   32,    1,    2,      S<1, 1, 2, 4>,     S<1, 4, 16, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,              3,              4,              4,      S<1, 1, 4, 4>,     S<1, 4, 16, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,             3,              4,              4,               7,               1,      true,      true, 720>
    // clang-format on
    >;
-*/
 template <>
 void add_device_splitk_gemm_instance<F32, F32, F32, Row, Col, Row>(
-    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& )
+    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& device_op_instances)
 {
- /*   using DeviceGemms = device_gemm_instance::device_gemm_xdl_instance_f32_f32_f32_mk_nk_mn;
+    using DeviceGemms = device_gemm_instance::device_gemm_xdl_instance_f32_f32_f32_mk_nk_mn;
    const auto device_gemms = DeviceGemms{};
@@ -58,7 +58,7 @@ void add_device_splitk_gemm_instance<F32, F32, F32, Row, Col, Row>(
        auto gemm = Gemm{};
        device_op_instances.push_back(std::make_unique<Gemm>(gemm));
-    });*/
+    });
 }
 } // namespace device_gemm_instance

--- a/device_operation/include/device_gemm_instance.hpp
+++ b/device_operation/include/device_gemm_instance.hpp
@@ -9,6 +9,10 @@ namespace tensor_operation {
 namespace device {
 namespace device_gemm_instance {
+using DeviceGemmNoOpPtr = DeviceGemmPtr<ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::PassThrough>;
 template <typename ADataType,
          typename BDataType,
          typename CDataType,

--- a/device_operation/include/device_gemm_xdl_instance.hpp
+++ b/device_operation/include/device_gemm_xdl_instance.hpp
@@ -5,10 +5,6 @@ namespace tensor_operation {
 namespace device {
 namespace device_gemm_instance {
-using DeviceGemmNoOpPtr = DeviceGemmPtr<ck::tensor_operation::element_wise::PassThrough,
-                                        ck::tensor_operation::element_wise::PassThrough,
-                                        ck::tensor_operation::element_wise::PassThrough>;
 template <>
 void add_device_splitk_gemm_instance<float,
                              float,