clean up

8160c31a · Chao Liu · 0e67221f · 8160c31a · 8160c31a · 8160c31a
Commit 8160c31a authored Jan 28, 2022 by Chao Liu
20 changed files
--- a/device_operation/device_gemm_xdl_instance_f16_f16_f16_km_kn_mn.cpp
+++ b/device_operation/device_gemm_xdl_instance_f16_f16_f16_km_kn_mn.cpp
 #include <stdlib.h>
 #include "config.hpp"
 #include "device_gemm_xdl.hpp"
-#include "device_gemm_instance.hpp"
 #include "element_wise_operation.hpp"
+#include "device_operation_instance.hpp"

 namespace ck {
 namespace tensor_operation {
@@ -21,7 +21,7 @@ using S = ck::Sequence<Is...>;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;

 // Compilation parameters for a[k, m] * b[k, n] = c[m, n]
-using device_gemm_xdl_instance_f16_f16_f16_km_kn_mn =
+using device_gemm_xdl_f16_f16_f16_km_kn_mn_instances =
    std::tuple<
        // clang-format off
        //##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
@@ -39,21 +39,10 @@ using device_gemm_xdl_instance_f16_f16_f16_km_kn_mn =
        // clang-format on
        >;

-template <>
-void add_device_gemm_instance<F16, F16, F16, Col, Row, Row>(
-    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& device_op_instances)
+void add_device_gemm_xdl_f16_f16_f16_km_kn_mn_instances(
+    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
 {
-    using DeviceGemms = device_gemm_instance::device_gemm_xdl_instance_f16_f16_f16_km_kn_mn;
-
-    const auto device_gemms = DeviceGemms{};
-
-    ck::static_for<0, std::tuple_size_v<DeviceGemms>, 1>{}([&](auto i) {
-        using Gemm = remove_cvref_t<decltype(std::get<i>(device_gemms))>;
-
-        auto gemm = Gemm{};
-
-        device_op_instances.push_back(std::make_unique<Gemm>(gemm));
-    });
+    add_device_operation_instances(instances, device_gemm_xdl_f16_f16_f16_km_kn_mn_instances{});
 }

 } // namespace device_gemm_instance

--- a/device_operation/device_gemm_xdl_instance_f16_f16_f16_km_nk_mn.cpp
+++ b/device_operation/device_gemm_xdl_instance_f16_f16_f16_km_nk_mn.cpp
 #include <stdlib.h>
 #include "config.hpp"
 #include "device_gemm_xdl.hpp"
-#include "device_gemm_instance.hpp"
 #include "element_wise_operation.hpp"
+#include "device_operation_instance.hpp"

 namespace ck {
 namespace tensor_operation {
@@ -21,7 +21,7 @@ using S = ck::Sequence<Is...>;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;

 // Compilation parameters for a[k, m] * b[n, k] = c[m, n]
-using device_gemm_xdl_instance_f16_f16_f16_km_nk_mn =
+using device_gemm_xdl_f16_f16_f16_km_nk_mn_instances =
    std::tuple<
        // clang-format off
        //##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
@@ -39,21 +39,10 @@ using device_gemm_xdl_instance_f16_f16_f16_km_nk_mn =
        // clang-format on
        >;

-template <>
-void add_device_gemm_instance<F16, F16, F16, Col, Col, Row>(
-    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& device_op_instances)
+void add_device_gemm_xdl_f16_f16_f16_km_nk_mn_instances(
+    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
 {
-    using DeviceGemms = device_gemm_instance::device_gemm_xdl_instance_f16_f16_f16_km_nk_mn;
-
-    const auto device_gemms = DeviceGemms{};
-
-    ck::static_for<0, std::tuple_size_v<DeviceGemms>, 1>{}([&](auto i) {
-        using Gemm = remove_cvref_t<decltype(std::get<i>(device_gemms))>;
-
-        auto gemm = Gemm{};
-
-        device_op_instances.push_back(std::make_unique<Gemm>(gemm));
-    });
+    add_device_operation_instances(instances, device_gemm_xdl_f16_f16_f16_km_nk_mn_instances{});
 }

 } // namespace device_gemm_instance

--- a/device_operation/device_gemm_xdl_instance_f16_f16_f16_mk_kn_mn.cpp
+++ b/device_operation/device_gemm_xdl_instance_f16_f16_f16_mk_kn_mn.cpp
 #include <stdlib.h>
 #include "config.hpp"
 #include "device_gemm_xdl.hpp"
-#include "device_gemm_instance.hpp"
 #include "element_wise_operation.hpp"
+#include "device_operation_instance.hpp"

 namespace ck {
 namespace tensor_operation {
@@ -21,7 +21,7 @@ using S = ck::Sequence<Is...>;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;

 // Compilation parameters for a[m, k] * b[k, n] = c[m, n]
-using device_gemm_xdl_instance_f16_f16_f16_mk_kn_mn =
+using device_gemm_xdl_f16_f16_f16_mk_kn_mn_instances =
    std::tuple<
        // clang-format off
        //##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
@@ -39,21 +39,10 @@ using device_gemm_xdl_instance_f16_f16_f16_mk_kn_mn =
        // clang-format on
        >;

-template <>
-void add_device_gemm_instance<F16, F16, F16, Row, Row, Row>(
-    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& device_op_instances)
+void add_device_gemm_xdl_f16_f16_f16_mk_kn_mn_instances(
+    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
 {
-    using DeviceGemms = device_gemm_instance::device_gemm_xdl_instance_f16_f16_f16_mk_kn_mn;
-
-    const auto device_gemms = DeviceGemms{};
-
-    ck::static_for<0, std::tuple_size_v<DeviceGemms>, 1>{}([&](auto i) {
-        using Gemm = remove_cvref_t<decltype(std::get<i>(device_gemms))>;
-
-        auto gemm = Gemm{};
-
-        device_op_instances.push_back(std::make_unique<Gemm>(gemm));
-    });
+    add_device_operation_instances(instances, device_gemm_xdl_f16_f16_f16_mk_kn_mn_instances{});
 }

 } // namespace device_gemm_instance

--- a/device_operation/device_gemm_xdl_instance_f16_f16_f16_mk_nk_mn.cpp
+++ b/device_operation/device_gemm_xdl_instance_f16_f16_f16_mk_nk_mn.cpp
 #include <stdlib.h>
 #include "config.hpp"
 #include "device_gemm_xdl.hpp"
-#include "device_gemm_instance.hpp"
 #include "element_wise_operation.hpp"
+#include "device_operation_instance.hpp"

 namespace ck {
 namespace tensor_operation {
@@ -21,7 +21,7 @@ using S = ck::Sequence<Is...>;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;

 // Compilation parameters for a[m, k] * b[n, k] = c[m, n]
-using device_gemm_xdl_instance_f16_f16_f16_mk_nk_mn =
+using device_gemm_xdl_f16_f16_f16_mk_nk_mn_instances =
    std::tuple<
        // clang-format off
        //##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
@@ -44,21 +44,10 @@ using device_gemm_xdl_instance_f16_f16_f16_mk_nk_mn =
        // clang-format on
        >;

-template <>
-void add_device_gemm_instance<F16, F16, F16, Row, Col, Row>(
-    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& device_op_instances)
+void add_device_gemm_xdl_f16_f16_f16_mk_nk_mn_instances(
+    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
 {
-    using DeviceGemms = device_gemm_instance::device_gemm_xdl_instance_f16_f16_f16_mk_nk_mn;
-
-    const auto device_gemms = DeviceGemms{};
-
-    ck::static_for<0, std::tuple_size_v<DeviceGemms>, 1>{}([&](auto i) {
-        using Gemm = remove_cvref_t<decltype(std::get<i>(device_gemms))>;
-
-        auto gemm = Gemm{};
-
-        device_op_instances.push_back(std::make_unique<Gemm>(gemm));
-    });
+    add_device_operation_instances(instances, device_gemm_xdl_f16_f16_f16_mk_nk_mn_instances{});
 }

 } // namespace device_gemm_instance

--- a/device_operation/device_gemm_xdl_instance_f32_f32_f32_km_kn_mn.cpp
+++ b/device_operation/device_gemm_xdl_instance_f32_f32_f32_km_kn_mn.cpp
 #include <stdlib.h>
 #include "config.hpp"
 #include "device_gemm_xdl.hpp"
-#include "device_gemm_splitk_xdl.hpp"
-#include "device_gemm_instance.hpp"
 #include "element_wise_operation.hpp"
+#include "device_operation_instance.hpp"

 namespace ck {
 namespace tensor_operation {
@@ -22,7 +21,7 @@ using S = ck::Sequence<Is...>;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;

 // Compilation parameters for a[k, m] * b[k, n] = c[m, n]
-using device_gemm_xdl_instance_f32_f32_f32_km_kn_mn =
+using device_gemm_xdl_f32_f32_f32_km_kn_mn_instances =
    std::tuple<
        // clang-format off
        //##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
@@ -40,21 +39,10 @@ using device_gemm_xdl_instance_f32_f32_f32_km_kn_mn =
        // clang-format on
        >;

-template <>
-void add_device_gemm_instance<F32, F32, F32, Col, Row, Row>(
-    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& device_op_instances)
+void add_device_gemm_xdl_f32_f32_f32_km_kn_mn_instances(
+    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
 {
-    using DeviceGemms = device_gemm_instance::device_gemm_xdl_instance_f32_f32_f32_km_kn_mn;
-
-    const auto device_gemms = DeviceGemms{};
-
-    ck::static_for<0, std::tuple_size_v<DeviceGemms>, 1>{}([&](auto i) {
-        using Gemm = remove_cvref_t<decltype(std::get<i>(device_gemms))>;
-
-        auto gemm = Gemm{};
-
-        device_op_instances.push_back(std::make_unique<Gemm>(gemm));
-    });
+    add_device_operation_instances(instances, device_gemm_xdl_f32_f32_f32_km_kn_mn_instances{});
 }

 } // namespace device_gemm_instance

--- a/device_operation/device_gemm_xdl_instance_f32_f32_f32_km_nk_mn.cpp
+++ b/device_operation/device_gemm_xdl_instance_f32_f32_f32_km_nk_mn.cpp
 #include <stdlib.h>
 #include "config.hpp"
 #include "device_gemm_xdl.hpp"
-#include "device_gemm_splitk_xdl.hpp"
-#include "device_gemm_instance.hpp"
 #include "element_wise_operation.hpp"
+#include "device_operation_instance.hpp"

 namespace ck {
 namespace tensor_operation {
@@ -22,7 +21,7 @@ using S = ck::Sequence<Is...>;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;

 // Compilation parameters for a[k, m] * b[n, k] = c[m, n]
-using device_gemm_xdl_instance_f32_f32_f32_km_nk_mn =
+using device_gemm_xdl_f32_f32_f32_km_nk_mn_instances =
    std::tuple<
        // clang-format off
        //##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
@@ -40,21 +39,10 @@ using device_gemm_xdl_instance_f32_f32_f32_km_nk_mn =
        // clang-format on
        >;

-template <>
-void add_device_gemm_instance<F32, F32, F32, Col, Col, Row>(
-    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& device_op_instances)
+void add_device_gemm_xdl_f32_f32_f32_km_nk_mn_instances(
+    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
 {
-    using DeviceGemms = device_gemm_instance::device_gemm_xdl_instance_f32_f32_f32_km_nk_mn;
-
-    const auto device_gemms = DeviceGemms{};
-
-    ck::static_for<0, std::tuple_size_v<DeviceGemms>, 1>{}([&](auto i) {
-        using Gemm = remove_cvref_t<decltype(std::get<i>(device_gemms))>;
-
-        auto gemm = Gemm{};
-
-        device_op_instances.push_back(std::make_unique<Gemm>(gemm));
-    });
+    add_device_operation_instances(instances, device_gemm_xdl_f32_f32_f32_km_nk_mn_instances{});
 }

 } // namespace device_gemm_instance

--- a/device_operation/device_gemm_xdl_instance_f32_f32_f32_mk_kn_mn.cpp
+++ b/device_operation/device_gemm_xdl_instance_f32_f32_f32_mk_kn_mn.cpp
 #include <stdlib.h>
 #include "config.hpp"
 #include "device_gemm_xdl.hpp"
-#include "device_gemm_splitk_xdl.hpp"
-#include "device_gemm_instance.hpp"
 #include "element_wise_operation.hpp"
+#include "device_operation_instance.hpp"

 namespace ck {
 namespace tensor_operation {
@@ -22,7 +21,7 @@ using S = ck::Sequence<Is...>;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;

 // Compilation parameters for a[m, k] * b[k, n] = c[m, n]
-using device_gemm_xdl_instance_f32_f32_f32_mk_kn_mn =
+using device_gemm_xdl_f32_f32_f32_mk_kn_mn_instances =
    std::tuple<
        // clang-format off
        //##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
@@ -40,21 +39,10 @@ using device_gemm_xdl_instance_f32_f32_f32_mk_kn_mn =
        // clang-format on
        >;

-template <>
-void add_device_gemm_instance<F32, F32, F32, Row, Row, Row>(
-    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& device_op_instances)
+void add_device_gemm_xdl_f32_f32_f32_mk_kn_mn_instances(
+    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
 {
-    using DeviceGemms = device_gemm_instance::device_gemm_xdl_instance_f32_f32_f32_mk_kn_mn;
-
-    const auto device_gemms = DeviceGemms{};
-
-    ck::static_for<0, std::tuple_size_v<DeviceGemms>, 1>{}([&](auto i) {
-        using Gemm = remove_cvref_t<decltype(std::get<i>(device_gemms))>;
-
-        auto gemm = Gemm{};
-
-        device_op_instances.push_back(std::make_unique<Gemm>(gemm));
-    });
+    add_device_operation_instances(instances, device_gemm_xdl_f32_f32_f32_mk_kn_mn_instances{});
 }

 } // namespace device_gemm_instance

--- a/device_operation/device_gemm_xdl_instance_f32_f32_f32_mk_nk_mn.cpp
+++ b/device_operation/device_gemm_xdl_instance_f32_f32_f32_mk_nk_mn.cpp
 #include <stdlib.h>
 #include "config.hpp"
 #include "device_gemm_xdl.hpp"
-#include "device_gemm_splitk_xdl.hpp"
-#include "device_gemm_instance.hpp"
 #include "element_wise_operation.hpp"
+#include "device_operation_instance.hpp"

 namespace ck {
 namespace tensor_operation {
@@ -22,7 +21,7 @@ using S = ck::Sequence<Is...>;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;

 // Compilation parameters for a[m, k] * b[n, k] = c[m, n]
-using device_gemm_xdl_instance_f32_f32_f32_mk_nk_mn =
+using device_gemm_xdl_f32_f32_f32_mk_nk_mn_instances =
    std::tuple<
        // clang-format off
        //##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
@@ -45,21 +44,10 @@ using device_gemm_xdl_instance_f32_f32_f32_mk_nk_mn =
        // clang-format on
        >;

-template <>
-void add_device_gemm_instance<F32, F32, F32, Row, Col, Row>(
-    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& device_op_instances)
+void add_device_gemm_xdl_f32_f32_f32_mk_nk_mn_instances(
+    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
 {
-    using DeviceGemms = device_gemm_instance::device_gemm_xdl_instance_f32_f32_f32_mk_nk_mn;
-
-    const auto device_gemms = DeviceGemms{};
-
-    ck::static_for<0, std::tuple_size_v<DeviceGemms>, 1>{}([&](auto i) {
-        using Gemm = remove_cvref_t<decltype(std::get<i>(device_gemms))>;
-
-        auto gemm = Gemm{};
-
-        device_op_instances.push_back(std::make_unique<Gemm>(gemm));
-    });
+    add_device_operation_instances(instances, device_gemm_xdl_f32_f32_f32_mk_nk_mn_instances{});
 }

 } // namespace device_gemm_instance

--- a/device_operation/device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instance.cpp
+++ b/device_operation/device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instance.cpp
+#include <stdlib.h>
+#include "config.hpp"
+#include "device_gemm_xdl_splitk.hpp"
+#include "element_wise_operation.hpp"
+#include "device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_gemm_instance {
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+// Compilation parameters of the split-K XDL GEMM instances (F32 data) for a[k, m] * b[k, n] = c[m, n]
+using device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances = std::tuple<
+    // clang-format off
+        //#################| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
+        //#################|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar|
+        //#################|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|
+        //#################|      |      |      |        |        |        |        |            |            |            |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |
+        DeviceGemmXdlSplitK<   F32,   F32,   F32,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,   256,   256,   128,     4,  4,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,              2,              4,              4,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              4,      true,               7,               1>,
+        DeviceGemmXdlSplitK<   F32,   F32,   F32,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,   256,   128,   256,     4,  4,   32,   32,    2,    4,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,              2,              2,              4,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              4,      true,               7,               1>,
+        DeviceGemmXdlSplitK<   F32,   F32,   F32,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,   128,   128,   128,     4,  4,   32,   32,    4,    2,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,              2,              4,              4,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              4,      true,               7,               1>,
+        DeviceGemmXdlSplitK<   F32,   F32,   F32,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,   256,   128,   128,     4,  4,   32,   32,    2,    2,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,              2,              2,              4,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              4,      true,               7,               1>,
+        DeviceGemmXdlSplitK<   F32,   F32,   F32,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,   128,   128,    64,     4,  4,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,              2,              4,              4,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              4,      true,               7,               1>,
+        DeviceGemmXdlSplitK<   F32,   F32,   F32,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,   128,    64,   128,     4,  4,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,              2,              2,              4,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              4,      true,               7,               1>,
+        DeviceGemmXdlSplitK<   F32,   F32,   F32,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,   256,   128,    64,     4,  4,   32,   32,    2,    1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,              2,              2,              4,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              4,      true,               7,               1>,
+        DeviceGemmXdlSplitK<   F32,   F32,   F32,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,   256,    64,   128,     4,  4,   32,   32,    1,    2,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,              2,              1,              4,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              4,      true,               7,               1>
+    // clang-format on
+    >;
+
+void add_device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances(
+    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances{});
+}
+
+} // namespace device_gemm_instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/device_operation/device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instance.cpp
+++ b/device_operation/device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instance.cpp
+#include <stdlib.h>
+#include "config.hpp"
+#include "device_gemm_xdl_splitk.hpp"
+#include "element_wise_operation.hpp"
+#include "device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_gemm_instance {
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+// Compilation parameters of the split-K XDL GEMM instances (F32 data) for a[k, m] * b[n, k] = c[m, n]
+using device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instances = std::tuple<
+    // clang-format off
+        //#################| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
+        //#################|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar|
+        //#################|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|
+        //#################|      |      |      |        |        |        |        |            |            |            |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |
+        DeviceGemmXdlSplitK<   F32,   F32,   F32,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,   256,   256,   128,     4,  4,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,              2,              4,              4,      true,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              4,              4,      true,               7,               1>,
+        DeviceGemmXdlSplitK<   F32,   F32,   F32,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,   256,   128,   256,     4,  4,   32,   32,    2,    4,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,              2,              2,              4,      true,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              4,              4,      true,               7,               1>,
+        DeviceGemmXdlSplitK<   F32,   F32,   F32,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,   128,   128,   128,     4,  4,   32,   32,    4,    2,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,              2,              4,              4,      true,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              4,              4,      true,               7,               1>,
+        DeviceGemmXdlSplitK<   F32,   F32,   F32,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,   256,   128,   128,     4,  4,   32,   32,    2,    2,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,              2,              2,              4,      true,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              4,              4,      true,               7,               1>,
+        DeviceGemmXdlSplitK<   F32,   F32,   F32,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,   128,   128,    64,     4,  4,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,              2,              4,              4,      true,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              4,              4,      true,               7,               1>,
+        DeviceGemmXdlSplitK<   F32,   F32,   F32,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,   128,    64,   128,     4,  4,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,              2,              2,              4,      true,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              4,              4,      true,               7,               1>,
+        DeviceGemmXdlSplitK<   F32,   F32,   F32,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,   256,   128,    64,     4,  4,   32,   32,    2,    1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,              2,              2,              4,      true,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              4,              4,      true,               7,               1>,
+        DeviceGemmXdlSplitK<   F32,   F32,   F32,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,   256,    64,   128,     4,  4,   32,   32,    1,    2,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,              2,              1,              4,      true,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              4,              4,      true,               7,               1>
+    // clang-format on
+    >;
+
+void add_device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instances(
+    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instances{});
+}
+
+} // namespace device_gemm_instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/device_operation/device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instance.cpp
+++ b/device_operation/device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instance.cpp
+#include <stdlib.h>
+#include "config.hpp"
+#include "device_gemm_xdl_splitk.hpp"
+#include "element_wise_operation.hpp"
+#include "device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_gemm_instance {
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+// Compilation parameters of the split-K XDL GEMM instances (F32 data) for a[m, k] * b[k, n] = c[m, n]
+using device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instances = std::tuple<
+    // clang-format off
+        //#################| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
+        //#################|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar|
+        //#################|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|
+        //#################|      |      |      |        |        |        |        |            |            |            |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |
+        DeviceGemmXdlSplitK<   F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,   256,    96,   128,     4,  8,   16,   16,    3,    4,  S<1, 4, 32, 2>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              4,              4,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              4,      true,               7,               1>,
+        DeviceGemmXdlSplitK<   F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,   256,   256,   128,     4,  4,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              4,              4,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              4,      true,               7,               1>,
+        DeviceGemmXdlSplitK<   F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,   256,   128,   256,     4,  4,   32,   32,    2,    4,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              4,              4,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              4,      true,               7,               1>,
+        DeviceGemmXdlSplitK<   F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,   128,   128,   128,     4,  4,   32,   32,    4,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              4,              4,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              4,      true,               7,               1>,
+        DeviceGemmXdlSplitK<   F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,   256,   128,   128,     4,  4,   32,   32,    2,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              4,              4,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              4,      true,               7,               1>,
+        DeviceGemmXdlSplitK<   F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,   128,   128,    64,     4,  4,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              4,              4,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              4,      true,               7,               1>,
+        DeviceGemmXdlSplitK<   F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,   128,    64,   128,     4,  4,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              4,              4,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              4,      true,               7,               1>,
+        DeviceGemmXdlSplitK<   F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,   256,   128,    64,     4,  4,   32,   32,    2,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              4,              4,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              1,              4,      true,               7,               1>,
+        DeviceGemmXdlSplitK<   F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,   256,    64,   128,     4,  4,   32,   32,    1,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              4,              4,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              4,      true,               7,               1>
+    >;
+
+// Registers the row-major (mk_kn_mn) F32 split-K XDL GEMM configurations.
+//
+// instances: container of device GEMM pointers that is handed, together with a
+//            default-constructed configuration tuple, to
+//            add_device_operation_instances.
+// NOTE(review): presumably one instance per tuple element gets appended to
+// `instances` - confirm against device_operation_instance.hpp.
+void add_device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instances(
+    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
+{
+    using TuningConfigs = device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instances;
+
+    add_device_operation_instances(instances, TuningConfigs{});
+}
+
+} // namespace device_gemm_instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/device_operation/device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instance.cpp
+++ b/device_operation/device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instance.cpp
+#include <stdlib.h>
+#include "config.hpp"
+#include "device_gemm_xdl_splitk.hpp"
+#include "element_wise_operation.hpp"
+#include "device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_gemm_instance {
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+// Compilation parameters for the split-K XDL GEMM a[m, k] * b[n, k] = c[m, n]
+// with F32 A/B/C/accumulator data, row-major A and C, and column-major B.
+//
+// Each tuple element is one DeviceGemmXdlSplitK instantiation (thirteen in
+// total); the column headers inside the table name the template argument found
+// in each position. Every entry uses PassThrough for the A, B and C
+// element-wise ops. The table is aligned by hand, hence the clang-format
+// off/on guards around it.
+using device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances = std::tuple<
+    // clang-format off
+        //#################| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
+        //#################|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar|
+        //#################|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|
+        //#################|      |      |      |        |        |        |        |            |            |            |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |
+        DeviceGemmXdlSplitK<   F32,   F32,   F32,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   256,   256,   128,     4,  4,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              4,              4,      true,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              4,              4,      true,               7,               1>,
+        DeviceGemmXdlSplitK<   F32,   F32,   F32,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   256,   128,   256,     4,  4,   32,   32,    2,    4,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              4,              4,      true,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              4,              4,      true,               7,               1>,
+        DeviceGemmXdlSplitK<   F32,   F32,   F32,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   128,   128,   128,     4,  4,   32,   32,    4,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              4,              4,      true,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              4,              4,      true,               7,               1>,
+        DeviceGemmXdlSplitK<   F32,   F32,   F32,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   256,   128,   128,     4,  4,   32,   32,    2,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              4,              4,      true,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              4,              4,      true,               7,               1>,
+        DeviceGemmXdlSplitK<   F32,   F32,   F32,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   128,   128,    64,     4,  4,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              4,              4,      true,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              4,              4,      true,               7,               1>,
+        DeviceGemmXdlSplitK<   F32,   F32,   F32,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   128,    64,   128,     4,  4,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              4,              4,      true,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              4,              4,      true,               7,               1>,
+        DeviceGemmXdlSplitK<   F32,   F32,   F32,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    64,    64,    64,     4,  4,   32,   32,    2,    2,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              4,              4,      true,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              4,              4,      true,               7,               1>,
+        DeviceGemmXdlSplitK<   F32,   F32,   F32,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   256,   128,    64,     4,  4,   32,   32,    2,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              4,              4,      true,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              4,              4,      true,               7,               1>,
+        DeviceGemmXdlSplitK<   F32,   F32,   F32,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   256,    64,   128,     4,  4,   32,   32,    1,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              4,              4,      true,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              4,              4,      true,               7,               1>,
+        DeviceGemmXdlSplitK<   F32,   F32,   F32,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   128,   128,    32,     4,  4,   32,   32,    2,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              4,              4,      true,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              4,              4,      true,               7,               1>,
+        DeviceGemmXdlSplitK<   F32,   F32,   F32,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   128,    32,   128,     4,  4,   32,   32,    1,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              4,              4,      true,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              4,              4,      true,               7,               1>,
+        DeviceGemmXdlSplitK<   F32,   F32,   F32,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    64,    64,    32,     4,  4,   32,   32,    2,    1,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              4,              4,      true,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              4,              4,      true,               7,               1>,
+        DeviceGemmXdlSplitK<   F32,   F32,   F32,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    64,    32,    64,     4,  4,   32,   32,    1,    2,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              4,              4,      true,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              4,              4,      true,               7,               1>
+    // clang-format on
+    >;
+
+// Registers the mk_nk_mn (row-major A, column-major B) F32 split-K XDL GEMM
+// configurations.
+//
+// instances: container of device GEMM pointers that is handed, together with a
+//            default-constructed configuration tuple, to
+//            add_device_operation_instances.
+// NOTE(review): presumably one instance per tuple element gets appended to
+// `instances` - confirm against device_operation_instance.hpp.
+void add_device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances(
+    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& instances)
+{
+    using TuningConfigs = device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances;
+
+    add_device_operation_instances(instances, TuningConfigs{});
+}
+
+} // namespace device_gemm_instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/device_operation/device_gemm_xdl_splitk_instance_f32_f32_f32_km_kn_mn.cpp
+++ b/device_operation/device_gemm_xdl_splitk_instance_f32_f32_f32_km_kn_mn.cpp
-#include <stdlib.h>
-#include "config.hpp"
-#include "device_gemm_xdl.hpp"
-#include "device_gemm_splitk_xdl.hpp"
-#include "device_gemm_instance.hpp"
-#include "element_wise_operation.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace device_gemm_instance {
-
-using F16 = ck::half_t;
-using F32 = float;
-
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
-
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-
-// Compilation parameters for a[k, m] * b[k, n] = c[m, n]
-using device_gemm_xdl_instance_f32_f32_f32_km_kn_mn =
-    std::tuple<
-        // clang-format off
-        //##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
-        //##########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar|
-        //##########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|
-        //##########|      |      |      |        |        |        |        |            |            |            |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |
-        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Col,     Row,     Row,  PassThrough, PassThrough, PassThrough,   256,   256,   128,     4,  4,   32,   32,    4,    2,      S<1, 4, 64, 1>,     S<0, 1, 3, 2>,     S<0, 1, 3, 2>,              2,              4,              4,      true,     S<1, 4, 64, 1>,     S<0, 1, 3, 2>,     S<0, 1, 3, 2>,             2,              2,              4,      true,               7,               1>,
-        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Col,     Row,     Row,  PassThrough, PassThrough, PassThrough,   256,   128,   256,     4,  4,   32,   32,    2,    4,      S<1, 4, 64, 1>,     S<0, 1, 3, 2>,     S<0, 1, 3, 2>,              2,              2,              4,      true,     S<1, 4, 64, 1>,     S<0, 1, 3, 2>,     S<0, 1, 3, 2>,             2,              4,              4,      true,               7,               1>,
-        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Col,     Row,     Row,  PassThrough, PassThrough, PassThrough,   128,   128,   128,     4,  4,   32,   32,    4,    2,     S<1, 4, 32, 1>,     S<0, 1, 3, 2>,     S<0, 1, 3, 2>,              2,              4,              4,      true,     S<1, 4, 32, 1>,     S<0, 1, 3, 2>,     S<0, 1, 3, 2>,             2,              4,              4,      true,               7,               1>,
-        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Col,     Row,     Row,  PassThrough, PassThrough, PassThrough,   256,   128,   128,     4,  4,   32,   32,    2,    2,     S<1, 4, 64, 1>,     S<0, 1, 3, 2>,     S<0, 1, 3, 2>,              2,              2,              4,      true,     S<1, 4, 64, 1>,     S<0, 1, 3, 2>,     S<0, 1, 3, 2>,             2,              2,              4,      true,               7,               1>,
-        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Col,     Row,     Row,  PassThrough, PassThrough, PassThrough,   128,   128,    64,     4,  4,   32,   32,    2,    2,     S<1, 4, 32, 1>,     S<0, 1, 3, 2>,     S<0, 1, 3, 2>,              2,              4,              4,      true,      S<1, 4, 32, 1>,     S<0, 1, 3, 2>,     S<0, 1, 3, 2>,             2,              2,              4,      true,               7,               1>,
-        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Col,     Row,     Row,  PassThrough, PassThrough, PassThrough,   128,    64,   128,     4,  4,   32,   32,    2,    2,      S<1, 4, 32, 1>,     S<0, 1, 3, 2>,     S<0, 1, 3, 2>,              2,              2,              4,      true,      S<1, 4, 32, 1>,     S<0, 1, 3, 2>,     S<0, 1, 3, 2>,             2,              4,              4,      true,               7,               1>,
-        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Col,     Row,     Row,  PassThrough, PassThrough, PassThrough,   256,   128,    64,     4,  4,   32,   32,    2,    1,      S<1, 4, 64, 1>,     S<0, 1, 3, 2>,     S<0, 1, 3, 2>,              2,              2,              4,      true,      S<1, 4, 64, 1>,     S<0, 1, 3, 2>,     S<0, 1, 3, 2>,             2,              1,              4,      true,               7,               1>,
-        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Col,     Row,     Row,  PassThrough, PassThrough, PassThrough,   256,    64,   128,     4,  4,   32,   32,    1,    2,      S<1, 4, 64, 1>,     S<0, 1, 3, 2>,     S<0, 1, 3, 2>,              2,              1,              4,      true,      S<1, 4, 64, 1>,     S<0, 1, 3, 2>,     S<0, 1, 3, 2>,             2,              2,              4,      true,               7,               1>
-        // clang-format on
-        >;
-
-template <>
-void add_device_splitk_gemm_instance<F32, F32, F32, Col, Row, Row>(
-    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& device_op_instances)
-{
-    using DeviceGemms = device_gemm_instance::device_gemm_xdl_instance_f32_f32_f32_km_kn_mn;
-
-    const auto device_gemms = DeviceGemms{};
-
-    ck::static_for<0, std::tuple_size_v<DeviceGemms>, 1>{}([&](auto i) {
-        using Gemm = remove_cvref_t<decltype(std::get<i>(device_gemms))>;
-
-        auto gemm = Gemm{};
-
-        device_op_instances.push_back(std::make_unique<Gemm>(gemm));
-    });
-}
-
-} // namespace device_gemm_instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
--- a/device_operation/device_gemm_xdl_splitk_instance_f32_f32_f32_km_nk_mn.cpp
+++ b/device_operation/device_gemm_xdl_splitk_instance_f32_f32_f32_km_nk_mn.cpp
-#include <stdlib.h>
-#include "config.hpp"
-#include "device_gemm_xdl.hpp"
-#include "device_gemm_splitk_xdl.hpp"
-#include "device_gemm_instance.hpp"
-#include "element_wise_operation.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace device_gemm_instance {
-
-using F16 = ck::half_t;
-using F32 = float;
-
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
-
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-
-// Compilation parameters for a[k, m] * b[n, k] = c[m, n]
-using device_gemm_xdl_instance_f32_f32_f32_km_nk_mn =
-    std::tuple<
-        // clang-format off
-        //##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
-        //##########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar|
-        //##########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|
-        //##########|      |      |      |        |        |        |        |            |            |            |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |
-        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,   256,   256,   128,     4,  4,   32,   32,    4,    2,           S<1, 4, 64, 1>,     S<0, 1, 3, 2> ,     S<0, 1, 3, 2> ,              2,              4,              4,      true,     S<1, 4, 64, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,             3,              4,              4,      true,               7,               1>,
-        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,   256,   128,   256,     4,  4,   32,   32,    2,    4,     S<1, 4, 64, 1>,     S<0, 1, 3, 2> ,     S<0, 1, 3, 2> ,              2,              2,              4,      true,           S<1, 4, 64, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,             3,              4,              4,      true,               7,               1>,
-        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,   128,   128,   128,     4,  4,   32,   32,    4,    2,           S<1, 4, 32, 1>,     S<0, 1, 3, 2> ,     S<0, 1, 3, 2> ,              2,              4,              4,      true,           S<1, 4, 32, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,             3,              4,              4,      true,               7,               1>,
-        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,   256,   128,   128,     4,  4,   32,   32,    2,    2,     S<1, 4, 64, 1>,     S<0, 1, 3, 2> ,     S<0, 1, 3, 2> ,              2,              2,              4,      true,     S<1, 4, 64, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,             3,              4,              4,      true,               7,               1>,
-        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,   128,   128,    64,     4,  4,   32,   32,    2,    2,           S<1, 4, 32, 1>,     S<0, 1, 3, 2> ,     S<0, 1, 3, 2> ,              2,              4,              4,      true,     S<1, 4, 32, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,             3,              4,              4,      true,               7,               1>,
-        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,   128,    64,   128,     4,  4,   32,   32,    2,    2,     S<1, 4, 32, 1>,     S<0, 1, 3, 2> ,     S<0, 1, 3, 2> ,              2,              2,              4,      true,           S<1, 4, 32, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,             3,              4,              4,      true,               7,               1>,
-        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,   256,   128,    64,     4,  4,   32,   32,    2,    1,     S<1, 4, 64, 1>,     S<0, 1, 3, 2> ,     S<0, 1, 3, 2> ,              2,              2,              4,      true,     S<1, 4, 64, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,             3,              4,              4,      true,               7,               1>,
-        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Col,     Col,     Row, PassThrough, PassThrough, PassThrough,   256,    64,   128,     4,  4,   32,   32,    1,    2,     S<1, 4, 64, 1>,     S<0, 1, 3, 2> ,     S<0, 1, 3, 2> ,              2,              1,              4,      true,     S<1, 4, 64, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,             3,              4,              4,      true,               7,               1>
-        // clang-format on
-        >;
-
-template <>
-void add_device_splitk_gemm_instance<F32, F32, F32, Col, Col, Row>(
-    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& device_op_instances)
-{
-    using DeviceGemms = device_gemm_instance::device_gemm_xdl_instance_f32_f32_f32_km_nk_mn;
-
-    const auto device_gemms = DeviceGemms{};
-
-    ck::static_for<0, std::tuple_size_v<DeviceGemms>, 1>{}([&](auto i) {
-        using Gemm = remove_cvref_t<decltype(std::get<i>(device_gemms))>;
-
-        auto gemm = Gemm{};
-
-        device_op_instances.push_back(std::make_unique<Gemm>(gemm));
-    });
-}
-
-} // namespace device_gemm_instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
--- a/device_operation/device_gemm_xdl_splitk_instance_f32_f32_f32_mk_kn_mn.cpp
+++ b/device_operation/device_gemm_xdl_splitk_instance_f32_f32_f32_mk_kn_mn.cpp
-#include <stdlib.h>
-#include "config.hpp"
-#include "device_gemm_xdl.hpp"
-#include "device_gemm_splitk_xdl.hpp"
-#include "device_gemm_instance.hpp"
-#include "element_wise_operation.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace device_gemm_instance {
-
-using F16 = ck::half_t;
-using F32 = float;
-
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
-
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-
-// Compilation parameters for a[m, k] * b[k, n] = c[m, n]
-using device_gemm_xdl_instance_f32_f32_f32_mk_kn_mn =
-    std::tuple<
-        // clang-format off
-        //##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
-        //##########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar|
-        //##########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|
-        //##########|      |      |      |        |        |        |        |            |            |            |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |
-        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,   256,   96,   128,     4,  8,   16,   16,    3,    4,     S<1, 4, 32, 2>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,              3,              4,              4,      true,     S<1, 4, 64, 1>,     S<0, 1, 3, 2>,     S<0, 1, 3, 2>,             2,              2,              4,      true,               7,               1>,
-        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,   256,   256,   128,     4,  4,   32,   32,    4,    2,     S<1, 4, 64, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,              3,              4,              4,      true,     S<1, 4, 64, 1>,     S<0, 1, 3, 2>,     S<0, 1, 3, 2>,             2,              2,              4,      true,               7,               1>,
-        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,   256,   128,   256,     4,  4,   32,   32,    2,    4,     S<1, 4, 64, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,              3,              4,              4,      true,     S<1, 4, 64, 1>,     S<0, 1, 3, 2>,     S<0, 1, 3, 2>,             2,              4,              4,      true,               7,               1>,
-        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,   128,   128,   128,     4,  4,   32,   32,    4,    2,     S<1, 4, 32, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,              3,              4,              4,      true,     S<1, 4, 32, 1>,     S<0, 1, 3, 2>,     S<0, 1, 3, 2>,             2,              4,              4,      true,               7,               1>,
-        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,   256,   128,   128,     4,  4,   32,   32,    2,    2,     S<1, 4, 64, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,              3,              4,              4,      true,     S<1, 4, 64, 1>,     S<0, 1, 3, 2>,     S<0, 1, 3, 2>,             2,              2,              4,      true,               7,               1>,
-        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,   128,   128,    64,     4,  4,   32,   32,    2,    2,     S<1, 4, 32, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,              3,              4,              4,      true,     S<1, 4, 32, 1>,     S<0, 1, 3, 2>,     S<0, 1, 3, 2>,             2,              2,              4,      true,               7,               1>,
-        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,   128,    64,   128,     4,  4,   32,   32,    2,    2,     S<1, 4, 32, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,              3,              4,              4,      true,     S<1, 4, 32, 1>,     S<0, 1, 3, 2>,     S<0, 1, 3, 2>,             2,              4,              4,      true,               7,               1>,
-        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,   256,   128,    64,     4,  4,   32,   32,    2,    1,     S<1, 4, 64, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,              3,              4,              4,      true,     S<1, 4, 64, 1>,     S<0, 1, 3, 2>,     S<0, 1, 3, 2>,             2,              1,              4,      true,               7,               1>,
-        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Row,     Row,     Row, PassThrough, PassThrough, PassThrough,   256,    64,   128,     4,  4,   32,   32,    1,    2,     S<1, 4, 64, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,              3,              4,              4,      true,     S<1, 4, 64, 1>,     S<0, 1, 3, 2>,     S<0, 1, 3, 2>,             2,              2,              4,      true,               7,               1>
-    >;
-
-template <>
-void add_device_splitk_gemm_instance<F32, F32, F32, Row, Row, Row>(
-    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& device_op_instances)
-{
-    using DeviceGemms = device_gemm_instance::device_gemm_xdl_instance_f32_f32_f32_mk_kn_mn;
-
-    const auto device_gemms = DeviceGemms{};
-
-    ck::static_for<0, std::tuple_size_v<DeviceGemms>, 1>{}([&](auto i) {
-        using Gemm = remove_cvref_t<decltype(std::get<i>(device_gemms))>;
-
-        auto gemm = Gemm{};
-
-        device_op_instances.push_back(std::make_unique<Gemm>(gemm));
-    });
-}
-
-} // namespace device_gemm_instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
--- a/device_operation/device_gemm_xdl_splitk_instance_f32_f32_f32_mk_nk_mn.cpp
+++ b/device_operation/device_gemm_xdl_splitk_instance_f32_f32_f32_mk_nk_mn.cpp
-#include <stdlib.h>
-#include "config.hpp"
-#include "device_gemm_xdl.hpp"
-#include "device_gemm_splitk_xdl.hpp"
-#include "device_gemm_instance.hpp"
-#include "element_wise_operation.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace device_gemm_instance {
-
-using F16 = ck::half_t;
-using F32 = float;
-
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
-
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-
-// Compilation parameters for a[m, k] * b[n, k] = c[m, n]
-using device_gemm_xdl_instance_f32_f32_f32_mk_nk_mn =
-    std::tuple<
-        // clang-format off
-        //##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
-        //##########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar|
-        //##########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|
-        //##########|      |      |      |        |        |        |        |            |            |            |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |
-        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Row,     Col,     Row,  PassThrough, PassThrough, PassThrough,   256,   256,   128,     4,  4,   32,   32,    4,    2,     S<1, 4, 64, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,              3,              4,              4,      true,     S<1, 4, 64, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,             3,              4,              4,      true,               7,               1>,
-        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Row,     Col,     Row,  PassThrough, PassThrough, PassThrough,   256,   128,   256,     4,  4,   32,   32,    2,    4,     S<1, 4, 64, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,              3,              4,              4,      true,     S<1, 4, 64, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,             3,              4,              4,      true,               7,               1>,
-        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Row,     Col,     Row,  PassThrough, PassThrough, PassThrough,   128,   128,   128,     4,  4,   32,   32,    4,    2,     S<1, 4, 32, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,              3,              4,              4,      true,     S<1, 4, 32, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,             3,              4,              4,      true,               7,               1>,
-        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Row,     Col,     Row,  PassThrough, PassThrough, PassThrough,   256,   128,   128,     4,  4,   32,   32,    2,    2,     S<1, 4, 64, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,              3,              4,              4,      true,     S<1, 4, 64, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,             3,              4,              4,      true,               7,               1>,
-        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Row,     Col,     Row,  PassThrough, PassThrough, PassThrough,   128,   128,    64,     4,  4,   32,   32,    2,    2,     S<1, 4, 32, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,              3,              4,              4,      true,     S<1, 4, 32, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,             3,              4,              4,      true,               7,               1>,
-        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Row,     Col,     Row,  PassThrough, PassThrough, PassThrough,   128,    64,   128,     4,  4,   32,   32,    2,    2,     S<1, 4, 32, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,              3,              4,              4,      true,     S<1, 4, 32, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,             3,              4,              4,      true,               7,               1>,
-        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Row,     Col,     Row,  PassThrough, PassThrough, PassThrough,    64,    64,    64,     4,  4,   32,   32,    2,    2,     S<1, 4, 16, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,              3,              4,              4,      true,     S<1, 4, 16, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,             3,              4,              4,      true,               7,               1>,
-        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Row,     Col,     Row,  PassThrough, PassThrough, PassThrough,   256,   128,    64,     4,  4,   32,   32,    2,    1,     S<1, 4, 64, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,              3,              4,              4,      true,     S<1, 4, 64, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,             3,              4,              4,      true,               7,               1>,
-        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Row,     Col,     Row,  PassThrough, PassThrough, PassThrough,   256,    64,   128,     4,  4,   32,   32,    1,    2,     S<1, 4, 64, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,              3,              4,              4,      true,     S<1, 4, 64, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,             3,              4,              4,      true,               7,               1>,
-        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Row,     Col,     Row,  PassThrough, PassThrough, PassThrough,   128,   128,    32,     4,  4,   32,   32,    2,    1,     S<1, 4, 32, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,              3,              4,              4,      true,     S<1, 4, 32, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,             3,              4,              4,      true,               7,               1>,
-        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Row,     Col,     Row,  PassThrough, PassThrough, PassThrough,   128,    32,   128,     4,  4,   32,   32,    1,    2,     S<1, 4, 32, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,              3,              4,              4,      true,     S<1, 4, 32, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,             3,              4,              4,      true,               7,               1>,
-        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Row,     Col,     Row,  PassThrough, PassThrough, PassThrough,    64,    64,    32,     4,  4,   32,   32,    2,    1,     S<1, 4, 16, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,              3,              4,              4,      true,     S<1, 4, 16, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,             3,              4,              4,      true,               7,               1>,
-        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Row,     Col,     Row,  PassThrough, PassThrough, PassThrough,    64,    32,    64,     4,  4,   32,   32,    1,    2,     S<1, 4, 16, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,              3,              4,              4,      true,     S<1, 4, 16, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,             3,              4,              4,      true,               7,               1>
-        // clang-format on
-        >;
-
-template <>
-void add_device_splitk_gemm_instance<F32, F32, F32, Row, Col, Row>(
-    std::vector<DeviceGemmPtr<PassThrough, PassThrough, PassThrough>>& device_op_instances)
-{
-    using DeviceGemms = device_gemm_instance::device_gemm_xdl_instance_f32_f32_f32_mk_nk_mn;
-
-    const auto device_gemms = DeviceGemms{};
-
-    ck::static_for<0, std::tuple_size_v<DeviceGemms>, 1>{}([&](auto i) {
-        using Gemm = remove_cvref_t<decltype(std::get<i>(device_gemms))>;
-
-        auto gemm = Gemm{};
-
-        device_op_instances.push_back(std::make_unique<Gemm>(gemm));
-    });
-}
-
-} // namespace device_gemm_instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
--- a/device_operation/include/device_gemm_instance.hpp
+++ b/device_operation/include/device_gemm_instance.hpp
-#ifndef DEVICE_GEMM_INSTANTCE_HPP
-#define DEVICE_GEMM_INSTANTCE_HPP
-
-#include "device_gemm.hpp"
-#include "element_wise_operation.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace device_gemm_instance {
-
-using DeviceGemmNoOpPtr = DeviceGemmPtr<ck::tensor_operation::element_wise::PassThrough,
-                                        ck::tensor_operation::element_wise::PassThrough,
-                                        ck::tensor_operation::element_wise::PassThrough>;
-
-template <typename ADataType,
-          typename BDataType,
-          typename CDataType,
-          typename ALayout,
-          typename BLayout,
-          typename CLayout>
-void add_device_gemm_instance(
-    std::vector<DeviceGemmPtr<ck::tensor_operation::element_wise::PassThrough,
-                              ck::tensor_operation::element_wise::PassThrough,
-                              ck::tensor_operation::element_wise::PassThrough>>&);
-
-template <typename ADataType,
-          typename BDataType,
-          typename CDataType,
-          typename ALayout,
-          typename BLayout,
-          typename CLayout>
-void add_device_splitk_gemm_instance(
-    std::vector<DeviceGemmPtr<ck::tensor_operation::element_wise::PassThrough,
-                              ck::tensor_operation::element_wise::PassThrough,
-                              ck::tensor_operation::element_wise::PassThrough>>&);
-
-} // namespace device_gemm_instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
-#endif
--- a/device_operation/include/device_gemm_splitk_xdl_instance.hpp
+++ b/device_operation/include/device_gemm_splitk_xdl_instance.hpp
-#ifndef DEVICE_GEMM_XDL_SPLITK_INSTANCE
-#define DEVICE_GEMM_XDL_SPLITK_INSTANCE
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace device_gemm_instance {
-
-template <>
-void add_device_splitk_gemm_instance<float,
-                                     float,
-                                     float,
-                                     ck::tensor_layout::gemm::RowMajor,
-                                     ck::tensor_layout::gemm::RowMajor,
-                                     ck::tensor_layout::gemm::RowMajor>(
-    std::vector<DeviceGemmNoOpPtr>&);
-
-template <>
-void add_device_splitk_gemm_instance<float,
-                                     float,
-                                     float,
-                                     ck::tensor_layout::gemm::RowMajor,
-                                     ck::tensor_layout::gemm::ColumnMajor,
-                                     ck::tensor_layout::gemm::RowMajor>(
-    std::vector<DeviceGemmNoOpPtr>&);
-
-template <>
-void add_device_splitk_gemm_instance<float,
-                                     float,
-                                     float,
-                                     ck::tensor_layout::gemm::ColumnMajor,
-                                     ck::tensor_layout::gemm::RowMajor,
-                                     ck::tensor_layout::gemm::RowMajor>(
-    std::vector<DeviceGemmNoOpPtr>&);
-
-template <>
-void add_device_splitk_gemm_instance<float,
-                                     float,
-                                     float,
-                                     ck::tensor_layout::gemm::ColumnMajor,
-                                     ck::tensor_layout::gemm::ColumnMajor,
-                                     ck::tensor_layout::gemm::RowMajor>(
-    std::vector<DeviceGemmNoOpPtr>&);
-
-} // namespace device_gemm_instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
-#endif
--- a/device_operation/include/device_gemm_splitk_xdl.hpp
+++ b/device_operation/include/device_gemm_splitk_xdl.hpp
@@ -55,7 +55,7 @@ template <typename ADataType,
          bool BBlockLdsAddExtraN,
          ck::index_t CThreadTransferSrcDstVectorDim,
          ck::index_t CThreadTransferDstScalarPerVector>
-struct DeviceGemmSplitKXdl
+struct DeviceGemmXdlSplitK
    : public DeviceGemm<AElementwiseOperation, BElementwiseOperation, CElementwiseOperation>
 {
    static constexpr auto I0 = Number<0>{};
@@ -276,13 +276,13 @@ struct DeviceGemmSplitKXdl
              c_element_op_{c_element_op},
              k_batch_{k_batch}
        {
-            int KPad = DeviceGemmSplitKXdl::GetKPad(K, k_batch_);
+            int KPad = DeviceGemmXdlSplitK::GetKPad(K, k_batch_);

-            a_grid_desc_kbatch_k0_m_k1_ = DeviceGemmSplitKXdl::MakeAGridDescriptor_KBatch_K0_M_K1(
+            a_grid_desc_kbatch_k0_m_k1_ = DeviceGemmXdlSplitK::MakeAGridDescriptor_KBatch_K0_M_K1(
                M, K, StrideA, k_batch_, KPad);
-            b_grid_desc_kbatch_k0_n_k1_ = DeviceGemmSplitKXdl::MakeBGridDescriptor_KBatch_K0_N_K1(
+            b_grid_desc_kbatch_k0_n_k1_ = DeviceGemmXdlSplitK::MakeBGridDescriptor_KBatch_K0_N_K1(
                K, N, StrideB, k_batch_, KPad);
-            c_grid_desc_m_n_ = DeviceGemmSplitKXdl::MakeCGridDescriptor_M_N(M, N, StrideC);
+            c_grid_desc_m_n_ = DeviceGemmXdlSplitK::MakeCGridDescriptor_M_N(M, N, StrideC);

            if(GridwiseGemm::CheckValidity(a_grid_desc_kbatch_k0_m_k1_,
                                           b_grid_desc_kbatch_k0_n_k1_,
@@ -318,7 +318,7 @@ struct DeviceGemmSplitKXdl
    // Invoker
    struct Invoker : public BaseInvoker
    {
-        using Argument = DeviceGemmSplitKXdl::Argument;
+        using Argument = DeviceGemmXdlSplitK::Argument;

        void ShowInfo(const Argument& arg)
        {
@@ -412,13 +412,13 @@ struct DeviceGemmSplitKXdl
                        GridwiseGemm,
                        ADataType, // TODO: distiguish A/B datatype
                        CDataType,
-                        remove_reference_t<DeviceGemmSplitKXdl::AGridDesc_K0_M_K1>,
-                        remove_reference_t<DeviceGemmSplitKXdl::BGridDesc_K0_N_K1>,
-                        remove_reference_t<DeviceGemmSplitKXdl::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
+                        remove_reference_t<DeviceGemmXdlSplitK::AGridDesc_K0_M_K1>,
+                        remove_reference_t<DeviceGemmXdlSplitK::BGridDesc_K0_N_K1>,
+                        remove_reference_t<DeviceGemmXdlSplitK::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
                        AElementwiseOperation,
                        BElementwiseOperation,
                        CElementwiseOperation,
-                        remove_reference_t<DeviceGemmSplitKXdl::Block2CTileMap>,
+                        remove_reference_t<DeviceGemmXdlSplitK::Block2CTileMap>,
                        true>;

                    Run(kernel);
@@ -429,13 +429,13 @@ struct DeviceGemmSplitKXdl
                        GridwiseGemmAtomicAdd,
                        ADataType, // TODO: distiguish A/B datatype
                        CDataType,
-                        remove_reference_t<DeviceGemmSplitKXdl::AGridDesc_K0_M_K1>,
-                        remove_reference_t<DeviceGemmSplitKXdl::BGridDesc_K0_N_K1>,
-                        remove_reference_t<DeviceGemmSplitKXdl::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
+                        remove_reference_t<DeviceGemmXdlSplitK::AGridDesc_K0_M_K1>,
+                        remove_reference_t<DeviceGemmXdlSplitK::BGridDesc_K0_N_K1>,
+                        remove_reference_t<DeviceGemmXdlSplitK::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
                        AElementwiseOperation,
                        BElementwiseOperation,
                        CElementwiseOperation,
-                        remove_reference_t<DeviceGemmSplitKXdl::Block2CTileMap>,
+                        remove_reference_t<DeviceGemmXdlSplitK::Block2CTileMap>,
                        true>;

                    Run(kernel);
@@ -449,13 +449,13 @@ struct DeviceGemmSplitKXdl
                        GridwiseGemm,
                        ADataType, // TODO: distiguish A/B datatype
                        CDataType,
-                        remove_reference_t<DeviceGemmSplitKXdl::AGridDesc_K0_M_K1>,
-                        remove_reference_t<DeviceGemmSplitKXdl::BGridDesc_K0_N_K1>,
-                        remove_reference_t<DeviceGemmSplitKXdl::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
+                        remove_reference_t<DeviceGemmXdlSplitK::AGridDesc_K0_M_K1>,
+                        remove_reference_t<DeviceGemmXdlSplitK::BGridDesc_K0_N_K1>,
+                        remove_reference_t<DeviceGemmXdlSplitK::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
                        AElementwiseOperation,
                        BElementwiseOperation,
                        CElementwiseOperation,
-                        remove_reference_t<DeviceGemmSplitKXdl::Block2CTileMap>,
+                        remove_reference_t<DeviceGemmXdlSplitK::Block2CTileMap>,
                        false>;

                    Run(kernel);
@@ -466,13 +466,13 @@ struct DeviceGemmSplitKXdl
                        GridwiseGemmAtomicAdd,
                        ADataType, // TODO: distiguish A/B datatype
                        CDataType,
-                        remove_reference_t<DeviceGemmSplitKXdl::AGridDesc_K0_M_K1>,
-                        remove_reference_t<DeviceGemmSplitKXdl::BGridDesc_K0_N_K1>,
-                        remove_reference_t<DeviceGemmSplitKXdl::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
+                        remove_reference_t<DeviceGemmXdlSplitK::AGridDesc_K0_M_K1>,
+                        remove_reference_t<DeviceGemmXdlSplitK::BGridDesc_K0_N_K1>,
+                        remove_reference_t<DeviceGemmXdlSplitK::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
                        AElementwiseOperation,
                        BElementwiseOperation,
                        CElementwiseOperation,
-                        remove_reference_t<DeviceGemmSplitKXdl::Block2CTileMap>,
+                        remove_reference_t<DeviceGemmXdlSplitK::Block2CTileMap>,
                        false>;

                    Run(kernel);

--- a/profiler/CMakeLists.txt
+++ b/profiler/CMakeLists.txt
@@ -14,18 +14,18 @@ include_directories(BEFORE

 # device_gemm_instance
 set(DEVICE_GEMM_INSTANCE_SOURCE 
-   ${PROJECT_SOURCE_DIR}/device_operation/device_gemm_xdl_instance_f32_f32_f32_mk_kn_mn.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/device_gemm_xdl_instance_f32_f32_f32_mk_nk_mn.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/device_gemm_xdl_instance_f32_f32_f32_km_kn_mn.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/device_gemm_xdl_instance_f32_f32_f32_km_nk_mn.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/device_gemm_xdl_instance_f16_f16_f16_mk_kn_mn.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/device_gemm_xdl_instance_f16_f16_f16_mk_nk_mn.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/device_gemm_xdl_instance_f16_f16_f16_km_kn_mn.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/device_gemm_xdl_instance_f16_f16_f16_km_nk_mn.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/device_gemm_xdl_splitk_instance_f32_f32_f32_mk_kn_mn.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/device_gemm_xdl_splitk_instance_f32_f32_f32_mk_nk_mn.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/device_gemm_xdl_splitk_instance_f32_f32_f32_km_kn_mn.cpp;
-   ${PROJECT_SOURCE_DIR}/device_operation/device_gemm_xdl_splitk_instance_f32_f32_f32_km_nk_mn.cpp;
+   ${PROJECT_SOURCE_DIR}/device_operation/device_gemm_xdl_f32_f32_f32_mk_kn_mn_instance.cpp;
+   ${PROJECT_SOURCE_DIR}/device_operation/device_gemm_xdl_f32_f32_f32_mk_nk_mn_instance.cpp;
+   ${PROJECT_SOURCE_DIR}/device_operation/device_gemm_xdl_f32_f32_f32_km_kn_mn_instance.cpp;
+   ${PROJECT_SOURCE_DIR}/device_operation/device_gemm_xdl_f32_f32_f32_km_nk_mn_instance.cpp;
+   ${PROJECT_SOURCE_DIR}/device_operation/device_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp;
+   ${PROJECT_SOURCE_DIR}/device_operation/device_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp;
+   ${PROJECT_SOURCE_DIR}/device_operation/device_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp;
+   ${PROJECT_SOURCE_DIR}/device_operation/device_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp;
+   ${PROJECT_SOURCE_DIR}/device_operation/device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instance.cpp;
+   ${PROJECT_SOURCE_DIR}/device_operation/device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instance.cpp;
+   ${PROJECT_SOURCE_DIR}/device_operation/device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instance.cpp;
+   ${PROJECT_SOURCE_DIR}/device_operation/device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instance.cpp;
 ) 

 add_library(device_gemm_instance SHARED ${DEVICE_GEMM_INSTANCE_SOURCE}) 
@@ -92,7 +92,7 @@ add_executable(ckProfiler ${PROFILER_SOURCE})

 target_link_libraries(ckProfiler PRIVATE host_tensor)
 target_link_libraries(ckProfiler PRIVATE device_gemm_instance)
-target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_instance)
-target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_instance)
-target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_add_instance)
-target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_atomic_add_instance)
+#target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_instance)
+#target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_instance)
+#target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_add_instance)
+#target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_atomic_add_instance)