merge upstream

4525c5d7 · coderfeli · a8d88d8d · 44828b7c · 4525c5d7 · 4525c5d7
Commit 4525c5d7 authored Dec 02, 2024 by coderfeli
20 changed files
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <cstdlib>
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp"
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp"

 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {

-using F16 = ck::half_t;
-using F32 = float;
-
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
-
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-using Empty_Tuple = ck::Tuple<>;
-
-using PassThrough                 = ck::tensor_operation::element_wise::PassThrough;
-static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
-
-// a[m, k] * b[n, k] = e[m, n]
-using device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances = std::tuple<
-    // clang-format off
-        //################################|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|      DsData| EData|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //################################| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|        Type|  Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //################################|       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //################################|       |       |            |       |      |      |        |         |            |      |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>
-    // clang-format on
-    >;
-
 void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances(
    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
                                                  Col,
@@ -65,8 +22,8 @@ void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances(
                                                  PassThrough,
                                                  PassThrough>>>& instances)
 {
-    add_device_operation_instances(instances,
-                                   device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances{});
+    add_device_operation_instances(
+        instances, device_grouped_gemm_xdl_splitk_2Bt_rcr_instances<F16, GemmDefault>{});
 }

 } // namespace instance

--- a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instance.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <cstdlib>
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp"
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_instance.hpp"

 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {

-using F16 = ck::half_t;
-using F32 = float;
-
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
-
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-using Empty_Tuple = ck::Tuple<>;
-
-using PassThrough                    = ck::tensor_operation::element_wise::PassThrough;
-static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
-
-using device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_tile_instances = std::tuple<
-    // clang-format off
-        //################################|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|      DsData| EData|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //################################| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|        Type|  Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //################################|       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //################################|       |       |            |       |      |      |        |         |            |      |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   192,    64,    32,   8,   8,   32,   32,    3,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,    64,   192,    32,   8,   8,   32,   32,    1,    3,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 48, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              4>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8>,    
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   192,    32,    32,   8,   8,   32,   32,    3,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,   192,    32,   8,   8,   32,   32,    1,    3,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,   256,    32,   8,   8,   32,   32,    1,    4,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    32,    64,    32,   8,   8,   32,   32,    1,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    64,    32,    32,   8,   8,   32,   32,    1,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,              8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   128,    64,    64,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8>,
-        DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,             3,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8>
-    // clang-format on
-    >;
-
 void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instances(
    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
                                                  Col,
@@ -72,7 +23,7 @@ void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instances
                                                  PassThrough>>>& instances)
 {
    add_device_operation_instances(
-        instances, device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_tile_instances{});
+        instances, device_grouped_gemm_xdl_splitk_2Bt_rcr_instances<F16, GemmMNKPadding>{});
 }

 } // namespace instance

--- a/library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_instance.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <cstdlib>
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_xdl_cshuffle_tile_loop.hpp"
-
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-using BF16 = ck::bhalf_t;
-using I8   = int8_t;
-using F32  = float;
-
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
-
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-using PassThrough         = ck::tensor_operation::element_wise::PassThrough;
-using Multiply            = ck::tensor_operation::element_wise::Multiply;
-using MultiplyAddFastGelu = ck::tensor_operation::element_wise::MultiplyAddFastGelu;
-using MultiplyFastGelu    = ck::tensor_operation::element_wise::MultiplyFastGelu;
-using MultiplyAdd         = ck::tensor_operation::element_wise::MultiplyAdd;
-
-static constexpr auto GemmDefault    = GemmSpecialization::Default;
-static constexpr auto GemmKPadding   = GemmSpecialization::KPadding;
-static constexpr auto GemmMNPadding  = GemmSpecialization::MNPadding;
-static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
-
-static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
-static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
-
-template <typename DsLayout,
-          typename DsDataType,
-          typename CDEElementwiseOp,
-          GemmSpecialization GemmSpec = GemmMNKPadding>
-using device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_comp_instances = std::tuple<
-    // clang-format off
-        //###########################################|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|      DsData| EData|           A|           B|                C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //###########################################| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|        Type|  Type| Elementwise| Elementwise|      Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //###########################################|       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //###########################################|       |       |            |       |      |      |        |         |            |      |            |            |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |   S<C,D0...,D_N| 
-        // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,   256,   256,   256,    32,   8,   4,   32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,         0,           1,           1,               S<1, 32, 1, 8>,        S<8,8,1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
-        // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,   256,   128,   128,    64,   8,   4,   32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,    S<16, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,         0,           1,           1,               S<1, 32, 1, 8>,        S<8,8,1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
-        // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,   256,   256,   256,    32,   8,   4,   32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,         0,           1,           1,               S<1, 32, 1, 8>,        S<8,8,1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
-        // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,   256,   256,   256,    32,   8,   4,   32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,         0,           1,           1,               S<1, 32, 1, 8>,        S<8,8,1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-        // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,   256,   224,   256,    64,   8,   4,   16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,    S<16, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,             16,              4,         0,           1,           2,               S<1, 32, 1, 8>,        S<8,8,1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-        // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,   256,   128,   128,    64,   8,   4,   32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,    S<16, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,         0,           1,           1,               S<1, 32, 1, 8>,        S<8,8,1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-        // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,   256,   128,   256,    32,   8,   4,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,         0,           1,           1,               S<1, 32, 1, 8>,        S<8,8,1>,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
-        DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,   256,   128,   128,    64,   8,   4,   32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,    S<16, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,         0,           1,           1,               S<1, 32, 1, 8>,        S<8,8,1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>
-
-    // clang-format on
-    >;
-
-template <typename DsLayout,
-          typename DsDataType,
-          typename CDEElementwiseOp,
-          GemmSpecialization GemmSpec                 = GemmMNKPadding,
-          BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave>
-using device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_mem_instances =
-    std::tuple<
-        // clang-format off
-        //###########################################|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|      DsData| EData|           A|           B|                C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //###########################################| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|        Type|  Type| Elementwise| Elementwise|      Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //###########################################|       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //###########################################|       |       |            |       |      |      |        |         |            |      |            |            |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |   S<C,D0...,D_N| 
-        // Latency friendly
-        // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,    64,    16,    16,   256,   8,   4,   16,   16,    1,    1,     S<32, 2, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,     S<64, 1, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,             16,              4,          0,          1,           1,               S<1, 16, 1, 4>,        S<4,4,1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
-        // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,   128,    16,    32,   256,   8,   4,   16,   16,    1,    1,     S<32, 4, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,     S<64, 2, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,             16,              4,          0,          1,           1,               S<1, 16, 1, 8>,        S<4,4,1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
-        // Memory friendly
-        // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,    64,    16,    16,   256,   8,   4,   16,   16,    1,    1,     S<32, 2, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,     S<64, 1, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,             16,              4,          0,          1,           1,               S<1, 16, 1, 4>,        S<4,4,1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,   128,    16,    32,   256,   8,   4,   16,   16,    1,    1,     S<32, 4, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,     S<64, 2, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,             16,              4,          0,          1,           1,               S<1, 16, 1, 8>,        S<4,4,1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,   128,    16,    64,   128,   8,   4,   16,   16,    1,    2,     S<16, 8, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,     S<32, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,             16,              4,          0,          1,           1,               S<1, 16, 1, 8>,        S<4,4,1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
-        // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,   128,    32,    64,   128,   8,   4,   32,   32,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,     S<32, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,             16,              4,          0,          1,           1,               S<1, 16, 1, 8>,        S<8,8,1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,   128,    16,   128,    64,   8,   4,   16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,             16,              4,          0,          1,           1,               S<1, 16, 1, 8>,        S<4,4,1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,   128,    32,   128,    64,   8,   4,   32,   32,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,             16,              4,          0,          1,           1,               S<1, 16, 1, 8>,        S<8,8,1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,   256,    16,   256,    64,   8,   4,   16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,    S<16, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,             16,              4,          0,          1,           1,              S<1, 16, 1, 16>,        S<4,4,1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,   256,    32,   256,    64,   8,   4,   32,   32,    1,    2,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,    S<16, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,             16,              4,          0,          1,           1,              S<1, 16, 1, 16>,        S<8,8,1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
-        // clang-format on
-        >;
-
-void add_device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn_instances(
-    std::vector<std::unique_ptr<DeviceGroupedGemmTileLoop<Row,
-                                                          Row,
-                                                          ck::Tuple<Row>,
-                                                          Row,
-                                                          BF16,
-                                                          I8,
-                                                          ck::Tuple<BF16>,
-                                                          BF16,
-                                                          PassThrough,
-                                                          PassThrough,
-                                                          Multiply>>>& instances)
-{
-    // comp
-    add_device_operation_instances(
-        instances,
-        device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_comp_instances<ck::Tuple<Row>,
-                                                                               ck::Tuple<BF16>,
-                                                                               Multiply,
-                                                                               GemmDefault>{});
-    add_device_operation_instances(
-        instances,
-        device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_comp_instances<ck::Tuple<Row>,
-                                                                               ck::Tuple<BF16>,
-                                                                               Multiply,
-                                                                               GemmMNKPadding>{});
-    add_device_operation_instances(
-        instances,
-        device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_comp_instances<ck::Tuple<Row>,
-                                                                               ck::Tuple<BF16>,
-                                                                               Multiply,
-                                                                               GemmMNPadding>{});
-    add_device_operation_instances(
-        instances,
-        device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_comp_instances<ck::Tuple<Row>,
-                                                                               ck::Tuple<BF16>,
-                                                                               Multiply,
-                                                                               GemmKPadding>{});
-    // mem
-    add_device_operation_instances(
-        instances,
-        device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_mem_instances<ck::Tuple<Row>,
-                                                                              ck::Tuple<BF16>,
-                                                                              Multiply,
-                                                                              GemmDefault,
-                                                                              Intrawave>{});
-    add_device_operation_instances(
-        instances,
-        device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_mem_instances<ck::Tuple<Row>,
-                                                                              ck::Tuple<BF16>,
-                                                                              Multiply,
-                                                                              GemmMNKPadding,
-                                                                              Intrawave>{});
-    add_device_operation_instances(
-        instances,
-        device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_mem_instances<ck::Tuple<Row>,
-                                                                              ck::Tuple<BF16>,
-                                                                              Multiply,
-                                                                              GemmMNPadding,
-                                                                              Intrawave>{});
-    add_device_operation_instances(
-        instances,
-        device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_mem_instances<ck::Tuple<Row>,
-                                                                              ck::Tuple<BF16>,
-                                                                              Multiply,
-                                                                              GemmKPadding,
-                                                                              Intrawave>{});
-
-    add_device_operation_instances(
-        instances,
-        device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_mem_instances<ck::Tuple<Row>,
-                                                                              ck::Tuple<BF16>,
-                                                                              Multiply,
-                                                                              GemmDefault,
-                                                                              Interwave>{});
-    add_device_operation_instances(
-        instances,
-        device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_mem_instances<ck::Tuple<Row>,
-                                                                              ck::Tuple<BF16>,
-                                                                              Multiply,
-                                                                              GemmMNKPadding,
-                                                                              Interwave>{});
-    add_device_operation_instances(
-        instances,
-        device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_mem_instances<ck::Tuple<Row>,
-                                                                              ck::Tuple<BF16>,
-                                                                              Multiply,
-                                                                              GemmMNPadding,
-                                                                              Interwave>{});
-    add_device_operation_instances(
-        instances,
-        device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_mem_instances<ck::Tuple<Row>,
-                                                                              ck::Tuple<BF16>,
-                                                                              Multiply,
-                                                                              GemmKPadding,
-                                                                              Interwave>{});
-}
-
-void add_device_grouped_gemm_xdl_tile_loop_multiply_bias_fastgelu_bf16_i8_bf16_mk_kn_mn_instances(
-    std::vector<std::unique_ptr<DeviceGroupedGemmTileLoop<Row,
-                                                          Row,
-                                                          ck::Tuple<Row, Row>,
-                                                          Row,
-                                                          BF16,
-                                                          I8,
-                                                          ck::Tuple<BF16, BF16>,
-                                                          BF16,
-                                                          PassThrough,
-                                                          PassThrough,
-                                                          MultiplyAddFastGelu>>>& instances)
-{
-    add_device_operation_instances(
-        instances,
-        device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_irregular_tile_instances<
-            ck::Tuple<Row, Row>,
-            ck::Tuple<BF16, BF16>,
-            MultiplyAddFastGelu>{});
-}
-
-void add_device_grouped_gemm_xdl_tile_loop_multiply_fastgelu_bf16_i8_bf16_mk_kn_mn_instances(
-    std::vector<std::unique_ptr<DeviceGroupedGemmTileLoop<Row,
-                                                          Row,
-                                                          ck::Tuple<Row>,
-                                                          Row,
-                                                          BF16,
-                                                          I8,
-                                                          ck::Tuple<BF16>,
-                                                          BF16,
-                                                          PassThrough,
-                                                          PassThrough,
-                                                          MultiplyFastGelu>>>& instances)
-{
-    add_device_operation_instances(
-        instances,
-        device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_irregular_tile_instances<
-            ck::Tuple<Row>,
-            ck::Tuple<BF16>,
-            MultiplyFastGelu>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
--- a/modified_files.txt
+++ b/modified_files.txt
+example/01_gemm/gemm_xdl_fp8_streamk_v3.cpp
+example/01_gemm/run_gemm_example_streamk_v2.inc
+include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp
+include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp
+library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp
+library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f8_f16/device_gemm_xdl_universal_streamk_f16_f8_f16_mk_nk_mn_comp_mnpadding_instance.cpp
+library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp
+library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_comp_mnpadding_instance.cpp
+profiler/src/profile_gemm_universal_streamk.cpp
+modified_files.txt
--- a/profiler/include/profiler/profile_grouped_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_gemm_impl.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -17,7 +17,6 @@
 #include "ck/library/utility/convolution_parameter.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
-#include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/literals.hpp"
 #include "ck/library/utility/fill.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
@@ -42,11 +41,14 @@ bool profile_grouped_gemm_impl(int do_verification,
                               const std::vector<int>& StrideAs,
                               const std::vector<int>& StrideBs,
                               const std::vector<int>& StrideCs,
-                               int kbatch   = 1,
-                               int n_warmup = 1,
-                               int n_iter   = 10)
+                               const std::vector<int>& kbatches = {},
+                               int n_warmup                     = 1,
+                               int n_iter                       = 10)
 {
    bool pass = true;
+    // TODO: The compute data type is not passed to this function, but it is
+    // needed to compute error thresholds; ADataType is used as a stand-in.
+    using ComputeDataType = ADataType;

    auto f_host_tensor_descriptor =
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
@@ -75,6 +77,7 @@ bool profile_grouped_gemm_impl(int do_verification,
    std::vector<Tensor<CDataType>> c_m_n_host_results;
    std::vector<Tensor<CDataType>> c_m_n_device_results;

+    ComputeDataType max_abs_in_val = 0.f;
    for(std::size_t i = 0; i < group_count; i++)
    {
        a_m_k.push_back(
@@ -93,17 +96,18 @@ bool profile_grouped_gemm_impl(int do_verification,
                      << i << "]:" << b_k_n[i].mDesc << ", c_m_n_device_results[" << i
                      << "]:" << c_m_n_device_results[i].mDesc << std::endl;
        }
-        std::size_t num_thread = 1;
        switch(init_method)
        {
        case 0: break;
        case 1:
-            a_m_k[i].GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5}, num_thread);
-            b_k_n[i].GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5}, num_thread);
+            ck::utils::FillUniformDistributionIntegerValue<ADataType>{-2.f, 2.f}(a_m_k[i]);
+            ck::utils::FillUniformDistributionIntegerValue<BDataType>{-2.f, 2.f}(b_k_n[i]);
+            max_abs_in_val = 2.f;
            break;
        default:
-            a_m_k[i].GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0}, num_thread);
-            b_k_n[i].GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5}, num_thread);
+            ck::utils::FillUniformDistribution<ADataType>{-0.5f, 0.5f}(a_m_k[i]);
+            ck::utils::FillUniformDistribution<BDataType>{-0.5f, 0.5f}(b_k_n[i]);
+            max_abs_in_val = 0.5f;
        }
    }

@@ -164,7 +168,20 @@ bool profile_grouped_gemm_impl(int do_verification,
                                                                     BElementOp,
                                                                     CElementOp>;

-    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+    // When kbatch is greater than 1, the SplitK version is used.
+    using DeviceOpSplitK = ck::tensor_operation::device::DeviceGroupedGemmSplitK<ALayout,
+                                                                                 BLayout,
+                                                                                 ck::Tuple<>,
+                                                                                 CLayout,
+                                                                                 ADataType,
+                                                                                 BDataType,
+                                                                                 ck::Tuple<>,
+                                                                                 CDataType,
+                                                                                 AElementOp,
+                                                                                 BElementOp,
+                                                                                 CElementOp>;
+
+    auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
        DeviceOp>::GetInstances();

    if(op_ptrs.size() <= 0)
@@ -205,7 +222,6 @@ bool profile_grouped_gemm_impl(int do_verification,
            ref_invoker.Run(ref_argument);
        }
    }
-
    // profile device GEMM instances
    for(auto& gemm_ptr : op_ptrs)
    {
@@ -221,43 +237,44 @@ bool profile_grouped_gemm_impl(int do_verification,

        auto invoker_ptr = gemm_ptr->MakeInvokerPointer();

-        DeviceMem gemm_desc_workspace(gemm_ptr->GetWorkSpaceSize(argument_ptr.get()));
+        std::size_t workspace_size = gemm_ptr->GetWorkSpaceSize(argument_ptr.get());
+        std::size_t kargs_size     = gemm_ptr->GetDeviceKernelArgSize(argument_ptr.get());

-        gemm_ptr->SetWorkSpacePointer(argument_ptr.get(), gemm_desc_workspace.GetDeviceBuffer());
-        std::string gemm_name = gemm_ptr->GetTypeString();
+        DeviceMem gemm_workspace, gemm_kargs;

-        using DeviceOpSplitK = ck::tensor_operation::device::DeviceGroupedGemmSplitK<ALayout,
-                                                                                     BLayout,
-                                                                                     ck::Tuple<>,
-                                                                                     CLayout,
-                                                                                     ADataType,
-                                                                                     BDataType,
-                                                                                     ck::Tuple<>,
-                                                                                     CDataType,
-                                                                                     AElementOp,
-                                                                                     BElementOp,
-                                                                                     CElementOp>;
-
-        // skip non-splitk grouped_gemm
-        if(dynamic_cast<DeviceOpSplitK*>(gemm_ptr.get()) == nullptr)
+        // This is necessary because the TwoStage kernel uses additional memory for both
+        // the workspace and the kernel arguments.
+        if(kargs_size > 0)
        {
-            continue;
+            gemm_kargs.Realloc(kargs_size);
+            gemm_ptr->SetDeviceKernelArgs(argument_ptr.get(), gemm_kargs.GetDeviceBuffer());
+        }
+        if(workspace_size > 0 && workspace_size != kargs_size)
+        {
+            gemm_workspace.Realloc(workspace_size);
+            gemm_ptr->SetWorkSpacePointer(argument_ptr.get(), gemm_workspace.GetDeviceBuffer());
        }

+        std::string gemm_name = gemm_ptr->GetTypeString();
+
        std::vector<int> kbatch_list = {1, 2, 4, 8, 12, 16, 20, 24, 32, 48, 64};

-        if(kbatch > 0)
+        // If the user provides a non-empty kbatches list, test those values instead of the
+        // default kbatch list.
+        if(!kbatches.empty())
        {
-            kbatch_list = {kbatch};
+            kbatch_list = kbatches;
        }

        for(std::size_t j = 0; j < kbatch_list.size(); j++)
        {
-
            auto kbatch_curr = kbatch_list[j];

-            dynamic_cast<DeviceOpSplitK*>(gemm_ptr.get())
-                ->SetKBatchSize(argument_ptr.get(), kbatch_curr);
+            if(kbatch_curr > 1 && dynamic_cast<DeviceOpSplitK*>(gemm_ptr.get()) != nullptr)
+            {
+                dynamic_cast<DeviceOpSplitK*>(gemm_ptr.get())
+                    ->SetKBatchSize(argument_ptr.get(), kbatch_curr);
+            }

            if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
            {
@@ -272,23 +289,18 @@ bool profile_grouped_gemm_impl(int do_verification,
                    bool instance_pass = true;
                    for(std::size_t i = 0; i < gemm_descs.size(); i++)
                    {
-
                        c_device_buf[i]->FromDevice(c_m_n_device_results[i].mData.data());
-
-                        if(std::is_same_v<CDataType, ck::half_t> && kbatch_curr > 1)
-                        {
-                            instance_pass =
-                                instance_pass && ck::utils::check_err(c_m_n_device_results[i],
-                                                                      c_m_n_host_results[i],
-                                                                      "Error: Incorrect results!",
-                                                                      0.06);
-                        }
-                        else
-                        {
-                            instance_pass =
-                                instance_pass && ck::utils::check_err(c_m_n_device_results[i],
-                                                                      c_m_n_host_results[i]);
-                        }
+                        auto atol = ck::utils::get_absolute_threshold<ComputeDataType, CDataType>(
+                            max_abs_in_val, gemm_descs[i].K_);
+                        auto rtol = ck::utils::get_relative_threshold<ComputeDataType, CDataType>(
+                            gemm_descs[i].K_);
+
+                        instance_pass =
+                            instance_pass && ck::utils::check_err(c_m_n_device_results[i],
+                                                                  c_m_n_host_results[i],
+                                                                  "Error: Incorrect results!",
+                                                                  rtol,
+                                                                  atol);

                        if(do_log)
                        {
@@ -311,11 +323,12 @@ bool profile_grouped_gemm_impl(int do_verification,
                    pass = pass && instance_pass;
                }

-                float ave_time = invoker_ptr->Run(
-                    argument_ptr.get(), StreamConfig{nullptr, time_kernel, 0, n_warmup, n_iter});
-
                if(time_kernel)
                {
+                    float ave_time =
+                        invoker_ptr->Run(argument_ptr.get(),
+                                         StreamConfig{nullptr, time_kernel, 0, n_warmup, n_iter});
+
                    std::size_t flop = 0, num_btype = 0;
                    for(std::size_t i = 0; i < gemm_descs.size(); i++)
                    {

--- a/profiler/include/profiler/profile_grouped_gemm_multiply_tile_loop_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_gemm_multiply_tile_loop_impl.hpp
@@ -143,8 +143,7 @@ bool profile_grouped_gemm_multiply_tile_loop_impl(int do_verification,
    p_ds.reserve(group_count);
    p_e.reserve(group_count);

-    using KernelArguments =
-        ck::tensor_operation::device::GroupedGemmTileLoopKernelArguments<NumDTensor>;
+    using KernelArguments = ck::tensor_operation::device::GroupedGemmKernelArgument<NumDTensor>;

    std::vector<ck::tensor_operation::device::GemmDesc> gemm_descs;
    std::vector<KernelArguments> gemm_kargs;

--- a/profiler/include/profiler/profile_grouped_gemm_tile_loop_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_gemm_tile_loop_impl.hpp
@@ -127,7 +127,7 @@ bool profile_grouped_gemm_tile_loop_impl(int do_verification,
    p_b.reserve(group_count);
    p_c.reserve(group_count);

-    using KernelArguments = ck::tensor_operation::device::GroupedGemmTileLoopKernelArguments<>;
+    using KernelArguments = ck::tensor_operation::device::GroupedGemmKernelArgument<>;

    std::vector<ck::tensor_operation::device::GemmDesc> gemm_descs;
    std::vector<KernelArguments> gemm_kargs;

--- a/profiler/include/profiler/profile_grouped_gemm_two_stage_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_gemm_two_stage_impl.hpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include <iomanip>
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/device_grouped_gemm.hpp"
-#include "ck/tensor_operation/gpu/device/device_grouped_gemm_splitk.hpp"
-#include "ck/tensor_operation/gpu/device/device_grouped_gemm_multiple_d_splitk.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-
-#include "ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp"
-
-#include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/convolution_parameter.hpp"
-#include "ck/library/utility/device_memory.hpp"
-#include "ck/library/utility/host_tensor.hpp"
-#include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/utility/literals.hpp"
-#include "ck/library/utility/fill.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
-
-namespace ck {
-namespace profiler {
-
-template <typename ADataType,
-          typename BDataType,
-          typename CDataType,
-          typename AccDataType,
-          typename ALayout,
-          typename BLayout,
-          typename CLayout>
-bool profile_grouped_gemm_two_stage_impl(int do_verification,
-                                         int init_method,
-                                         bool do_log,
-                                         bool time_kernel,
-                                         const std::vector<int>& Ms,
-                                         const std::vector<int>& Ns,
-                                         const std::vector<int>& Ks,
-                                         const std::vector<int>& StrideAs,
-                                         const std::vector<int>& StrideBs,
-                                         const std::vector<int>& StrideCs,
-                                         int kbatch   = 1,
-                                         int n_warmup = 1,
-                                         int n_iter   = 10)
-{
-    bool pass = true;
-
-    auto f_host_tensor_descriptor =
-        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
-            using namespace ck::literals;
-
-            if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
-            {
-                return HostTensorDescriptor({row, col}, {stride, 1_uz});
-            }
-            else
-            {
-                return HostTensorDescriptor({row, col}, {1_uz, stride});
-            }
-        };
-
-    std::size_t group_count = Ms.size();
-
-    if(!(group_count == Ns.size() && group_count == Ks.size() && group_count == StrideAs.size() &&
-         group_count == StrideBs.size() && group_count == StrideCs.size()))
-    {
-        throw std::runtime_error("wrong! inconsistent M/N/Ks, StrideA/B/Cs size\n");
-    }
-
-    std::vector<Tensor<ADataType>> a_m_k;
-    std::vector<Tensor<BDataType>> b_k_n;
-    std::vector<Tensor<CDataType>> c_m_n_host_results;
-    std::vector<Tensor<CDataType>> c_m_n_device_results;
-
-    for(std::size_t i = 0; i < group_count; i++)
-    {
-        a_m_k.push_back(
-            Tensor<ADataType>(f_host_tensor_descriptor(Ms[i], Ks[i], StrideAs[i], ALayout{})));
-        b_k_n.push_back(
-            Tensor<BDataType>(f_host_tensor_descriptor(Ks[i], Ns[i], StrideBs[i], BLayout{})));
-
-        c_m_n_device_results.push_back(
-            Tensor<CDataType>(f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{})));
-
-        c_m_n_host_results.push_back(
-            Tensor<CDataType>(f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{})));
-        if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
-        {
-            std::cout << "group: " << i << " a_m_k[" << i << "]:" << a_m_k[i].mDesc << ", b_k_n["
-                      << i << "]:" << b_k_n[i].mDesc << ", c_m_n_device_results[" << i
-                      << "]:" << c_m_n_device_results[i].mDesc << std::endl;
-        }
-        std::size_t num_thread = 1;
-        switch(init_method)
-        {
-        case 0: break;
-        case 1:
-            a_m_k[i].GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5}, num_thread);
-            b_k_n[i].GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5}, num_thread);
-            break;
-        default:
-            a_m_k[i].GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0}, num_thread);
-            b_k_n[i].GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5}, num_thread);
-        }
-    }
-
-    using AElementOp = ck::tensor_operation::element_wise::PassThrough;
-    using BElementOp = ck::tensor_operation::element_wise::PassThrough;
-    using CElementOp = ck::tensor_operation::element_wise::PassThrough;
-
-    const auto a_element_op = AElementOp{};
-    const auto b_element_op = BElementOp{};
-    const auto c_element_op = CElementOp{};
-
-    using DeviceMemPtr = std::unique_ptr<DeviceMem>;
-    std::vector<DeviceMemPtr> a_device_buf, b_device_buf, c_device_buf;
-
-    a_device_buf.reserve(group_count);
-    b_device_buf.reserve(group_count);
-    c_device_buf.reserve(group_count);
-
-    std::vector<const void*> p_a, p_b;
-    std::vector<void*> p_c;
-
-    p_a.reserve(group_count);
-    p_b.reserve(group_count);
-    p_c.reserve(group_count);
-
-    std::vector<ck::tensor_operation::device::GemmDesc> gemm_descs;
-
-    gemm_descs.reserve(group_count);
-
-    for(std::size_t i = 0; i < group_count; i++)
-    {
-        a_device_buf.emplace_back(
-            std::make_unique<DeviceMem>(sizeof(ADataType) * a_m_k[i].mDesc.GetElementSpaceSize()));
-        b_device_buf.emplace_back(
-            std::make_unique<DeviceMem>(sizeof(BDataType) * b_k_n[i].mDesc.GetElementSpaceSize()));
-        c_device_buf.emplace_back(std::make_unique<DeviceMem>(
-            sizeof(CDataType) * c_m_n_device_results[i].mDesc.GetElementSpaceSize()));
-
-        a_device_buf[i]->ToDevice(a_m_k[i].mData.data());
-        b_device_buf[i]->ToDevice(b_k_n[i].mData.data());
-
-        gemm_descs.push_back({Ms[i], Ns[i], Ks[i], StrideAs[i], StrideBs[i], StrideCs[i], {}});
-
-        p_a.push_back(a_device_buf[i]->GetDeviceBuffer());
-        p_b.push_back(b_device_buf[i]->GetDeviceBuffer());
-        p_c.push_back(c_device_buf[i]->GetDeviceBuffer());
-    }
-
-    using DeviceOp = ck::tensor_operation::device::DeviceGroupedGemm<ALayout,
-                                                                     BLayout,
-                                                                     ck::Tuple<>,
-                                                                     CLayout,
-                                                                     ADataType,
-                                                                     BDataType,
-                                                                     ck::Tuple<>,
-                                                                     CDataType,
-                                                                     AElementOp,
-                                                                     BElementOp,
-                                                                     CElementOp>;
-
-    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
-        DeviceOp>::GetInstances();
-
-    if(op_ptrs.size() <= 0)
-    {
-        throw std::runtime_error("wrong! no device GEMM instance found");
-    }
-
-    std::string best_gemm_name;
-    float best_ave_time   = 0;
-    float best_tflops     = 0;
-    float best_gb_per_sec = 0;
-    float best_kbatch     = 0;
-
-    auto p_ds = std::vector<std::array<const void*, 0>>{};
-
-    if(do_verification)
-    {
-        for(std::size_t i = 0; i < gemm_descs.size(); i++)
-        {
-            using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
-                                                                                    BDataType,
-                                                                                    CDataType,
-                                                                                    AccDataType,
-                                                                                    AElementOp,
-                                                                                    BElementOp,
-                                                                                    CElementOp>;
-
-            auto ref_gemm    = ReferenceGemmInstance{};
-            auto ref_invoker = ref_gemm.MakeInvoker();
-
-            auto ref_argument = ref_gemm.MakeArgument(a_m_k[i],
-                                                      b_k_n[i],
-                                                      c_m_n_host_results[i],
-                                                      a_element_op,
-                                                      b_element_op,
-                                                      c_element_op);
-
-            ref_invoker.Run(ref_argument);
-        }
-    }
-
-    // profile device GEMM instances
-    for(auto& gemm_ptr : op_ptrs)
-    {
-        auto argument_ptr =
-            gemm_ptr->MakeArgumentPointer(p_a,
-                                          p_b,
-                                          p_ds,
-                                          p_c,
-                                          gemm_descs,
-                                          ck::tensor_operation::element_wise::PassThrough{},
-                                          ck::tensor_operation::element_wise::PassThrough{},
-                                          ck::tensor_operation::element_wise::PassThrough{});
-
-        auto invoker_ptr = gemm_ptr->MakeInvokerPointer();
-
-        DeviceMem gemm_desc_workspace(gemm_ptr->GetWorkSpaceSize(argument_ptr.get()));
-        gemm_ptr->SetWorkSpacePointer(argument_ptr.get(), gemm_desc_workspace.GetDeviceBuffer());
-
-        std::string gemm_name = gemm_ptr->GetTypeString();
-
-        using DeviceOpSplitK =
-            ck::tensor_operation::device::DeviceGroupedGemmMultipleDSplitK<ALayout,
-                                                                           BLayout,
-                                                                           ck::Tuple<>,
-                                                                           CLayout,
-                                                                           ADataType,
-                                                                           BDataType,
-                                                                           ck::Tuple<>,
-                                                                           CDataType,
-                                                                           AElementOp,
-                                                                           BElementOp,
-                                                                           CElementOp>;
-
-        // skip non-splitk grouped_gemm
-        if(dynamic_cast<DeviceOpSplitK*>(gemm_ptr.get()) == nullptr)
-        {
-            continue;
-        }
-
-        std::vector<int> kbatch_list = {1, 2, 4, 8, 12, 16, 20, 24, 32, 48, 64};
-
-        if(kbatch > 0)
-        {
-            kbatch_list = {kbatch};
-        }
-
-        for(std::size_t j = 0; j < kbatch_list.size(); j++)
-        {
-
-            auto kbatch_curr = kbatch_list[j];
-            dynamic_cast<DeviceOpSplitK*>(gemm_ptr.get())
-                ->SetKBatchSize(argument_ptr.get(), kbatch_curr);
-
-            DeviceMem gemm_arg_dev_mem(dynamic_cast<DeviceOpSplitK*>(gemm_ptr.get())
-                                           ->GetDeviceKernelArgSize(argument_ptr.get()));
-            dynamic_cast<DeviceOpSplitK*>(gemm_ptr.get())
-                ->SetDeviceKernelArgs(argument_ptr.get(), gemm_arg_dev_mem.GetDeviceBuffer());
-
-            if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
-            {
-                gemm_desc_workspace.SetZero();
-                for(std::size_t i = 0; i < gemm_descs.size(); i++)
-                    c_device_buf[i]->SetZero();
-
-                invoker_ptr->Run(argument_ptr.get(),
-                                 StreamConfig{nullptr, false, 0, n_warmup, n_iter});
-                if(do_verification)
-                {
-                    bool instance_pass = true;
-                    for(std::size_t i = 0; i < gemm_descs.size(); i++)
-                    {
-                        c_device_buf[i]->FromDevice(c_m_n_device_results[i].mData.data());
-                        if(std::is_same_v<CDataType, ck::half_t> && kbatch_curr > 1)
-                        {
-                            instance_pass =
-                                instance_pass && ck::utils::check_err(c_m_n_device_results[i],
-                                                                      c_m_n_host_results[i],
-                                                                      "Error: Incorrect results!",
-                                                                      0.06);
-                        }
-                        else
-                        {
-                            instance_pass =
-                                instance_pass && ck::utils::check_err(c_m_n_device_results[i],
-                                                                      c_m_n_host_results[i]);
-                        }
-
-                        if(do_log)
-                        {
-                            LogRangeAsType<float>(std::cout << "a : ", a_m_k[i].mData, ",")
-                                << std::endl;
-                            LogRangeAsType<float>(std::cout << "b: ", b_k_n[i].mData, ",")
-                                << std::endl;
-                            LogRangeAsType<float>(
-                                std::cout << "c_device: ", c_m_n_device_results[i].mData, ",")
-                                << std::endl;
-                            LogRangeAsType<float>(
-                                std::cout << "c_host  : ", c_m_n_host_results[i].mData, ",")
-                                << std::endl;
-                        }
-                    }
-
-                    std::cout << "Instance: " << gemm_name << " verification "
-                              << (instance_pass ? "SUCCEED" : "FAILED") << std::endl;
-
-                    pass = pass && instance_pass;
-                }
-                float ave_time = invoker_ptr->Run(
-                    argument_ptr.get(), StreamConfig{nullptr, time_kernel, 0, n_warmup, n_iter});
-                if(time_kernel)
-                {
-                    std::size_t flop = 0, num_btype = 0;
-                    for(std::size_t i = 0; i < gemm_descs.size(); i++)
-                    {
-                        flop += std::size_t(2) * Ms[i] * Ns[i] * Ks[i];
-
-                        num_btype += sizeof(ADataType) * Ms[i] * Ks[i] +
-                                     sizeof(BDataType) * Ks[i] * Ns[i] +
-                                     sizeof(CDataType) * Ms[i] * Ns[i];
-                    }
-
-                    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
-
-                    float gb_per_sec = num_btype / 1.E6 / ave_time;
-                    std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops
-                              << " TFlops, " << gb_per_sec << " GB/s, " << gemm_name << ", KBatch "
-                              << kbatch_curr << std::endl;
-
-                    if(tflops > best_tflops)
-                    {
-                        best_gemm_name  = gemm_name;
-                        best_tflops     = tflops;
-                        best_ave_time   = ave_time;
-                        best_gb_per_sec = gb_per_sec;
-                        best_kbatch     = kbatch_curr;
-                    }
-                }
-            }
-            else
-            {
-                std::cout << "Instance: " << gemm_name << ", does not support this GEMM problem"
-                          << std::endl;
-            }
-        }
-    }
-
-    if(time_kernel)
-    {
-        std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
-                  << best_gb_per_sec << " GB/s, " << best_gemm_name << ", KBatch = " << best_kbatch
-                  << std::endl;
-    }
-
-    return pass;
-}
-
-} // namespace profiler
-} // namespace ck
--- a/profiler/src/CMakeLists.txt
+++ b/profiler/src/CMakeLists.txt
@@ -43,7 +43,6 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
    list(APPEND PROFILER_SOURCES profile_gemm_add_silu.cpp)
    list(APPEND PROFILER_SOURCES profile_gemm_add_relu_add_layernorm.cpp)
    list(APPEND PROFILER_SOURCES profile_grouped_gemm_fixed_nk.cpp)
-    list(APPEND PROFILER_SOURCES profile_grouped_gemm_two_stage.cpp)
    list(APPEND PROFILER_SOURCES profile_grouped_gemm_fastgelu.cpp)
    list(APPEND PROFILER_SOURCES profile_grouped_gemm_tile_loop.cpp)
    list(APPEND PROFILER_SOURCES profile_grouped_gemm_multiply_tile_loop.cpp)

--- a/profiler/src/profile_gemm_universal_streamk.cpp
+++ b/profiler/src/profile_gemm_universal_streamk.cpp
@@ -85,8 +85,10 @@ int profile_gemm_universal_streamk(int argc, char* argv[])

    using F32 = float;
    using F16 = ck::half_t;
-    // using BF16 = ck::bhalf_t;
-    // using F8   = ck::f8_t;
+
+#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)
+    using F8 = ck::f8_t;
+#endif

    using Row = ck::tensor_layout::gemm::RowMajor;
    using Col = ck::tensor_layout::gemm::ColumnMajor;
@@ -145,6 +147,24 @@ int profile_gemm_universal_streamk(int argc, char* argv[])
    {
        return profile(F16{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{});
    }
+#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)
+    else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_KN_MN)
+    {
+        return profile(F16{}, F8{}, F32{}, F16{}, Row{}, Row{}, Row{});
+    }
+    else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_NK_MN)
+    {
+        return profile(F16{}, F8{}, F32{}, F16{}, Row{}, Col{}, Row{});
+    }
+    else if(data_type == GemmDataType::F8_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
+    {
+        return profile(F8{}, F16{}, F32{}, F16{}, Row{}, Row{}, Row{});
+    }
+    else if(data_type == GemmDataType::F8_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
+    {
+        return profile(F8{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{});
+    }
+#endif
    else
    {
        std::cout << "this data_type & layout is not implemented" << std::endl;

--- a/profiler/src/profile_grouped_gemm.cpp
+++ b/profiler/src/profile_grouped_gemm.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>
 #include <numeric>
@@ -39,16 +39,13 @@ namespace {
 std::vector<int> argToIntArray(char* input)
 {
    std::vector<int> out;
-
    std::istringstream in(input);
-
    std::string item;

    while(std::getline(in, item, ','))
    {
        out.push_back(std::stoi(item));
    }
-
    return out;
 }

@@ -69,7 +66,7 @@ int profile_grouped_gemm(int argc, char* argv[])
            << "arg7: time kernel (0=n0, 1=yes)\n"
            << "arg8 to 13: Ms, Ns, Ks, StrideAs, StrideBs, StrideCs (e.g., 256,256 128,128 64,64 "
               "64,64 64,64 128,128)\n"
-            << "arg15: kbatch value (default 1)\n"
+            << "arg15: kbatch values (default 1)\n"
            << "optional:\n"
            << "arg16: number of warm-up cycles (default 1)\n"
            << "arg17: number of iterations (default 10)\n"
@@ -92,7 +89,7 @@ int profile_grouped_gemm(int argc, char* argv[])
    const auto StrideAs = argToIntArray(argv[11]);
    const auto StrideBs = argToIntArray(argv[12]);
    const auto StrideCs = argToIntArray(argv[13]);
-    const int kbatch    = argc == 15 ? std::stoi(argv[14]) : 1;
+    const auto kbatches = argc >= 15 ? argToIntArray(argv[14]) : std::vector<int>{};

    int n_warmup = 1;
    int n_iter   = 10;
@@ -102,7 +99,6 @@ int profile_grouped_gemm(int argc, char* argv[])
        n_iter   = std::stoi(argv[16]);
    }

-#ifdef CK_ENABLE_FP16
    if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
    {
        ck::profiler::profile_grouped_gemm_impl<ck::half_t,
@@ -121,7 +117,7 @@ int profile_grouped_gemm(int argc, char* argv[])
                                                                                   StrideAs,
                                                                                   StrideBs,
                                                                                   StrideCs,
-                                                                                   kbatch,
+                                                                                   kbatches,
                                                                                   n_warmup,
                                                                                   n_iter);
    }
@@ -143,7 +139,7 @@ int profile_grouped_gemm(int argc, char* argv[])
                                                                                   StrideAs,
                                                                                   StrideBs,
                                                                                   StrideCs,
-                                                                                   kbatch,
+                                                                                   kbatches,
                                                                                   n_warmup,
                                                                                   n_iter);
    }
@@ -165,7 +161,7 @@ int profile_grouped_gemm(int argc, char* argv[])
                                                                                   StrideAs,
                                                                                   StrideBs,
                                                                                   StrideCs,
-                                                                                   kbatch,
+                                                                                   kbatches,
                                                                                   n_warmup,
                                                                                   n_iter);
    }
@@ -187,7 +183,7 @@ int profile_grouped_gemm(int argc, char* argv[])
                                                                                   StrideAs,
                                                                                   StrideBs,
                                                                                   StrideCs,
-                                                                                   kbatch,
+                                                                                   kbatches,
                                                                                   n_warmup,
                                                                                   n_iter);
    }
@@ -209,7 +205,7 @@ int profile_grouped_gemm(int argc, char* argv[])
                                                                                   StrideAs,
                                                                                   StrideBs,
                                                                                   StrideCs,
-                                                                                   kbatch,
+                                                                                   kbatches,
                                                                                   n_warmup,
                                                                                   n_iter);
    }
@@ -231,7 +227,73 @@ int profile_grouped_gemm(int argc, char* argv[])
                                                                                   StrideAs,
                                                                                   StrideBs,
                                                                                   StrideCs,
-                                                                                   kbatch,
+                                                                                   kbatches,
+                                                                                   n_warmup,
+                                                                                   n_iter);
+    }
+    else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_KN_MN)
+    {
+        ck::profiler::profile_grouped_gemm_impl<ck::bhalf_t,
+                                                ck::bhalf_t,
+                                                ck::bhalf_t,
+                                                float,
+                                                ck::tensor_layout::gemm::RowMajor,
+                                                ck::tensor_layout::gemm::RowMajor,
+                                                ck::tensor_layout::gemm::RowMajor>(do_verification,
+                                                                                   init_method,
+                                                                                   do_log,
+                                                                                   time_kernel,
+                                                                                   Ms,
+                                                                                   Ns,
+                                                                                   Ks,
+                                                                                   StrideAs,
+                                                                                   StrideBs,
+                                                                                   StrideCs,
+                                                                                   kbatches,
+                                                                                   n_warmup,
+                                                                                   n_iter);
+    }
+    else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_NK_MN)
+    {
+        ck::profiler::profile_grouped_gemm_impl<ck::bhalf_t,
+                                                ck::bhalf_t,
+                                                ck::bhalf_t,
+                                                float,
+                                                ck::tensor_layout::gemm::RowMajor,
+                                                ck::tensor_layout::gemm::ColumnMajor,
+                                                ck::tensor_layout::gemm::RowMajor>(do_verification,
+                                                                                   init_method,
+                                                                                   do_log,
+                                                                                   time_kernel,
+                                                                                   Ms,
+                                                                                   Ns,
+                                                                                   Ks,
+                                                                                   StrideAs,
+                                                                                   StrideBs,
+                                                                                   StrideCs,
+                                                                                   kbatches,
+                                                                                   n_warmup,
+                                                                                   n_iter);
+    }
+    else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::KM_KN_MN)
+    {
+        ck::profiler::profile_grouped_gemm_impl<ck::bhalf_t,
+                                                ck::bhalf_t,
+                                                ck::bhalf_t,
+                                                float,
+                                                ck::tensor_layout::gemm::ColumnMajor,
+                                                ck::tensor_layout::gemm::RowMajor,
+                                                ck::tensor_layout::gemm::RowMajor>(do_verification,
+                                                                                   init_method,
+                                                                                   do_log,
+                                                                                   time_kernel,
+                                                                                   Ms,
+                                                                                   Ns,
+                                                                                   Ks,
+                                                                                   StrideAs,
+                                                                                   StrideBs,
+                                                                                   StrideCs,
+                                                                                   kbatches,
                                                                                   n_warmup,
                                                                                   n_iter);
    }
@@ -239,7 +301,6 @@ int profile_grouped_gemm(int argc, char* argv[])
    {
        throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented");
    }
-#endif
    return 0;
 }


--- a/profiler/src/profile_grouped_gemm_fixed_nk.cpp
+++ b/profiler/src/profile_grouped_gemm_fixed_nk.cpp
@@ -32,9 +32,7 @@ namespace {
 std::vector<int> argToIntArray(char* input)
 {
    std::vector<int> out;
-
    std::istringstream in(input);
-
    std::string item;

    while(std::getline(in, item, ','))
@@ -83,7 +81,7 @@ int profile_grouped_gemm_fixed_nk(int argc, char* argv[])
    const auto StrideAs = argToIntArray(argv[11]);
    const auto StrideBs = argToIntArray(argv[12]);
    const auto StrideCs = argToIntArray(argv[13]);
-    const int kbatch    = argc == 15 ? std::stoi(argv[14]) : 1;
+    const int kbatch    = argc >= 15 ? std::stoi(argv[14]) : 1;

    using F32 = float;
    using F16 = ck::half_t;
@@ -97,8 +95,8 @@ int profile_grouped_gemm_fixed_nk(int argc, char* argv[])
    int n_iter   = 10;
    if(argc == 17)
    {
-        n_warmup = std::stoi(argv[16]);
-        n_iter   = std::stoi(argv[17]);
+        n_warmup = std::stoi(argv[15]);
+        n_iter   = std::stoi(argv[16]);
    }

 #if defined(CK_ENABLE_BF16) && defined(CK_ENABLE_INT8)

--- a/profiler/src/profile_grouped_gemm_two_stage.cpp
+++ b/profiler/src/profile_grouped_gemm_two_stage.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <iostream>
-#include <numeric>
-#include <initializer_list>
-#include <cstdlib>
-
-#include "profiler/profile_grouped_gemm_two_stage_impl.hpp"
-#include "profiler_operation_registry.hpp"
-
-enum struct GemmMatrixLayout
-{
-    MK_KN_MN, // 0
-    MK_NK_MN, // 1
-};
-
-enum struct GemmDataType
-{
-    F16_F16_F16,    // 0
-    BF16_INT8_BF16, // 1
-    BF16_BF16_BF16  // 2
-};
-
-#define OP_NAME "grouped_gemm_two_stage"
-#define OP_DESC "Grouped GEMM TwoStage"
-
-namespace {
-
-std::vector<int> argToIntArray(char* input)
-{
-    std::vector<int> out;
-
-    std::istringstream in(input);
-
-    std::string item;
-
-    while(std::getline(in, item, ','))
-    {
-        out.push_back(std::stoi(item));
-    }
-
-    return out;
-}
-
-int profile_grouped_gemm_two_stage(int argc, char* argv[])
-{
-    if(argc < 14)
-    {
-        std::cout
-            << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
-            << "arg2: data type (0: fp16; 1: bf16@int8; 2: bf16)\n"
-            << "arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n]);\n"
-            << "arg4: verification (0: no; 1: yes)\n"
-            << "arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"
-            << "arg6: print tensor value (0: no; 1: yes)\n"
-            << "arg7: time kernel (0=n0, 1=yes)\n"
-            << "arg8 to 13: Ms, Ns, Ks, StrideAs, StrideBs, StrideCs (e.g., 256,256 128,128 64,64 "
-               "64,64 64,64 128,128)\n"
-            << "arg15: kbatch value (default 1)\n"
-            << "optional:\n"
-            << "arg16: number of warm-up cycles (default 1)\n"
-            << "arg17: number of iterations (default 10)\n"
-            << std::endl;
-
-        exit(1);
-    }
-
-    const auto data_type       = static_cast<GemmDataType>(std::stoi(argv[2]));
-    const auto layout          = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
-    const bool do_verification = std::stoi(argv[4]);
-    const int init_method      = std::stoi(argv[5]);
-    const bool do_log          = std::stoi(argv[6]);
-    const bool time_kernel     = std::stoi(argv[7]);
-
-    const auto Ms = argToIntArray(argv[8]);
-    const auto Ns = argToIntArray(argv[9]);
-    const auto Ks = argToIntArray(argv[10]);
-
-    auto StrideAs    = argToIntArray(argv[11]);
-    auto StrideBs    = argToIntArray(argv[12]);
-    auto StrideCs    = argToIntArray(argv[13]);
-    const int kbatch = argc == 15 ? std::stoi(argv[14]) : 1;
-
-    const int DefaultStrideA = Ks[0];
-    const int DefaultStrideB = Ns[0];
-    const int DefaultStrideC = Ns[0];
-
-    for(size_t i = 0; i < Ms.size(); ++i)
-    {
-        StrideAs[i] = StrideAs[i] == -1 ? DefaultStrideA : StrideAs[i];
-        StrideBs[i] = StrideBs[i] == -1 ? DefaultStrideB : StrideBs[i];
-        StrideCs[i] = StrideCs[i] == -1 ? DefaultStrideC : StrideCs[i];
-    }
-
-    int n_warmup = 1;
-    int n_iter   = 10;
-    if(argc == 17)
-    {
-        n_warmup = std::stoi(argv[16]);
-        n_iter   = std::stoi(argv[17]);
-    }
-
-    if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
-    {
-        ck::profiler::profile_grouped_gemm_two_stage_impl<ck::half_t,
-                                                          ck::half_t,
-                                                          ck::half_t,
-                                                          float,
-                                                          ck::tensor_layout::gemm::RowMajor,
-                                                          ck::tensor_layout::gemm::RowMajor,
-                                                          ck::tensor_layout::gemm::RowMajor>(
-            do_verification,
-            init_method,
-            do_log,
-            time_kernel,
-            Ms,
-            Ns,
-            Ks,
-            StrideAs,
-            StrideBs,
-            StrideCs,
-            kbatch,
-            n_warmup,
-            n_iter);
-    }
-    else if(data_type == GemmDataType::BF16_INT8_BF16 && layout == GemmMatrixLayout::MK_KN_MN)
-    {
-        ck::profiler::profile_grouped_gemm_two_stage_impl<ck::bhalf_t,
-                                                          int8_t,
-                                                          ck::bhalf_t,
-                                                          float,
-                                                          ck::tensor_layout::gemm::RowMajor,
-                                                          ck::tensor_layout::gemm::RowMajor,
-                                                          ck::tensor_layout::gemm::RowMajor>(
-            do_verification,
-            init_method,
-            do_log,
-            time_kernel,
-            Ms,
-            Ns,
-            Ks,
-            StrideAs,
-            StrideBs,
-            StrideCs,
-            kbatch,
-            n_warmup,
-            n_iter);
-    }
-    else if(data_type == GemmDataType::BF16_INT8_BF16 && layout == GemmMatrixLayout::MK_NK_MN)
-    {
-        ck::profiler::profile_grouped_gemm_two_stage_impl<ck::bhalf_t,
-                                                          int8_t,
-                                                          ck::bhalf_t,
-                                                          float,
-                                                          ck::tensor_layout::gemm::RowMajor,
-                                                          ck::tensor_layout::gemm::ColumnMajor,
-                                                          ck::tensor_layout::gemm::RowMajor>(
-            do_verification,
-            init_method,
-            do_log,
-            time_kernel,
-            Ms,
-            Ns,
-            Ks,
-            StrideAs,
-            StrideBs,
-            StrideCs,
-            kbatch,
-            n_warmup,
-            n_iter);
-    }
-    else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_KN_MN)
-    {
-        ck::profiler::profile_grouped_gemm_two_stage_impl<ck::bhalf_t,
-                                                          ck::bhalf_t,
-                                                          ck::bhalf_t,
-                                                          float,
-                                                          ck::tensor_layout::gemm::RowMajor,
-                                                          ck::tensor_layout::gemm::RowMajor,
-                                                          ck::tensor_layout::gemm::RowMajor>(
-            do_verification,
-            init_method,
-            do_log,
-            time_kernel,
-            Ms,
-            Ns,
-            Ks,
-            StrideAs,
-            StrideBs,
-            StrideCs,
-            kbatch,
-            n_warmup,
-            n_iter);
-    }
-    else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_NK_MN)
-    {
-        ck::profiler::profile_grouped_gemm_two_stage_impl<ck::bhalf_t,
-                                                          ck::bhalf_t,
-                                                          ck::bhalf_t,
-                                                          float,
-                                                          ck::tensor_layout::gemm::RowMajor,
-                                                          ck::tensor_layout::gemm::ColumnMajor,
-                                                          ck::tensor_layout::gemm::RowMajor>(
-            do_verification,
-            init_method,
-            do_log,
-            time_kernel,
-            Ms,
-            Ns,
-            Ks,
-            StrideAs,
-            StrideBs,
-            StrideCs,
-            kbatch,
-            n_warmup,
-            n_iter);
-    }
-    else
-    {
-        throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented");
-    }
-    return 0;
-}
-
-} // anonymous namespace
-
-REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_grouped_gemm_two_stage);
--- a/python/ck4inductor/batched_universal_gemm/gen_instances.py
+++ b/python/ck4inductor/batched_universal_gemm/gen_instances.py
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+import logging
+import os
+import subprocess
+from dataclasses import replace
+from functools import lru_cache
+from typing import List
+
+from ..util import library_path
+
+from .op import CKBatchedGemmOperation
+
+log = logging.getLogger(__name__)
+
+
+def _ck_library_dir():
+    """
+    Locate the directory holding the batched universal gemm instance sources.
+
+    Returns the path as a string, or None (after logging an error) when the
+    directory does not exist.
+    """
+    sub_dirs = ("src", "tensor_operation_instance", "gpu", "gemm_universal_batched")
+    gemm_instances_path = os.path.join(library_path(), *sub_dirs)
+    if os.path.exists(gemm_instances_path):
+        return gemm_instances_path
+    log.error("CK library path %s does not exist", gemm_instances_path)
+    return None
+
+
+def parse_instances(str_instances: List[str]) -> List[CKBatchedGemmOperation]:
+    """
+    Parse the lines containing batched Universal Gemm template instances into
+    `CKBatchedGemmOperation` instances.
+
+    Args:
+        str_instances: text lines, each expected to contain
+            `DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<...>`; anything before the
+            template name (e.g. a `file:line:` prefix) is discarded.
+
+    Returns:
+        One operation per line that names the template and yields enough
+        template arguments. Blank lines and lines that do not contain the
+        template name (case-sensitively) are skipped instead of crashing.
+
+    Raises:
+        TypeError: if a parsed line does not provide the number of arguments
+            the dataclass constructor expects.
+        ValueError: if an `S<...>` sequence contains a non-integer entry.
+    """
+    template_name = "DeviceBatchedGemmMultiD_Xdl_CShuffle_V3"
+    # indices of the placeholder arguments reset below (ds layout / ds dtype)
+    ds_layout_index = 2
+    ds_dtype_index = 6
+
+    def maybe_int(s):
+        try:
+            return int(s)
+        except ValueError:
+            return s
+
+    op_instances = []
+    for line in str_instances:
+        if template_name not in line:
+            # blank line (empty input) or a line that lacks the exact template name
+            continue
+
+        s_template_args = line.split(template_name)[-1].strip("<>, ")
+        template_args = []
+        i_current = 0
+        while i_current < len(s_template_args):
+            if s_template_args[i_current] == " ":
+                # skip whitespace
+                i_current += 1
+                continue
+            elif s_template_args[i_current : i_current + 2] == "S<":
+                # parse template S<Index...>
+                i_next = s_template_args.find(">", i_current)
+                # an unterminated trailing sequence runs to the end of the string
+                i_end = i_next if i_next != -1 else len(s_template_args)
+                template_args.append(
+                    tuple(map(int, s_template_args[i_current + 2 : i_end].split(",")))
+                )
+                # step over the closing '>' and the ',' that follows it
+                i_current = i_next + 2
+            else:
+                # all string attributes must be either type aliases or global constants in C++
+                i_next = s_template_args.find(",", i_current)
+                template_args.append(
+                    maybe_int(
+                        s_template_args[i_current : i_next if i_next != -1 else None]
+                    )
+                )
+                if i_next != -1:
+                    i_current = i_next + 1
+            if i_next == -1:
+                # the last argument has been consumed
+                break
+
+        if len(template_args) <= ds_dtype_index:
+            # not an instance definition (too few template arguments); nothing to build
+            log.debug("skipping line without template arguments: %s", line)
+            continue
+
+        # ds layout and dtype are parsed as placeholder; reset value
+        template_args[ds_layout_index] = tuple()  # ds layout
+        template_args[ds_dtype_index] = tuple()  # ds dtype
+
+        new_instance = CKBatchedGemmOperation(
+            *template_args,  # type: ignore[arg-type]
+        )
+
+        op_instances.append(new_instance)
+    return op_instances
+
+
+@lru_cache(None)
+def gen_ops_library() -> List[CKBatchedGemmOperation]:
+    """
+    Parse the batched Universal Gemm instances defined in the composable kernel library folder.
+
+    The instance sources are searched with `grep` for the
+    `DeviceBatchedGemmMultiD_Xdl_CShuffle_V3` template, matching lines are
+    handed to `parse_instances`, and the placeholder template arguments
+    `BlkGemmPipeSched` / `GemmSpec` are expanded into every concrete scheduler
+    and gemm specialization.
+
+    Returns:
+        The expanded list of operations; an empty list when the library
+        directory does not exist or nothing matched. The result is memoized
+        by `lru_cache`.
+    """
+    ck_library_dir = _ck_library_dir()
+    if not ck_library_dir:
+        return []
+
+    grep_result = subprocess.run(
+        [
+            "grep",
+            "-inR",
+            "DeviceBatchedGemmMultiD_Xdl_CShuffle_V3",
+            # reuse the directory validated above instead of resolving it again
+            ck_library_dir,
+        ],
+        capture_output=True,
+        text=True,
+    )
+    # grep exits with 1 when nothing matched; anything above that is an error
+    if grep_result.returncode > 1:
+        log.warning(
+            "grep over %s exited with %d: %s",
+            ck_library_dir,
+            grep_result.returncode,
+            grep_result.stderr.strip(),
+        )
+
+    # drop blank lines so that empty grep output yields no instances at all
+    matched_lines = [
+        matched for matched in grep_result.stdout.splitlines() if matched.strip()
+    ]
+    op_instances = parse_instances(matched_lines)
+
+    log.debug("ck instances from library: %d", len(op_instances))
+
+    schedulers = [
+        "BlockGemmPipelineScheduler::Intrawave",
+        "BlockGemmPipelineScheduler::Interwave",
+    ]
+    gemm_specs = [
+        "GemmSpecialization::Default",
+        "GemmSpecialization::MPadding",
+        "GemmSpecialization::NPadding",
+        "GemmSpecialization::KPadding",
+        "GemmSpecialization::MNPadding",
+        "GemmSpecialization::MKPadding",
+        "GemmSpecialization::NKPadding",
+        "GemmSpecialization::MNKPadding",
+    ]
+
+    # substitute templated args by looping through their domains
+    substitute_instances = []
+    for instance in op_instances:
+        sub_scheduler = instance.block_gemm_pipeline_scheduler == "BlkGemmPipeSched"
+        sub_spec = instance.gemm_specialization == "GemmSpec"
+        # a concrete (non-placeholder) value is kept as a one-element domain
+        schedulers_range = (
+            schedulers if sub_scheduler else [instance.block_gemm_pipeline_scheduler]
+        )
+        spec_range = gemm_specs if sub_spec else [instance.gemm_specialization]
+        for scheduler in schedulers_range:
+            for spec in spec_range:
+                substitute_instances.append(
+                    replace(
+                        instance,
+                        block_gemm_pipeline_scheduler=scheduler,
+                        gemm_specialization=spec,
+                    )
+                )
+
+    return substitute_instances
+
+
+# Manual check: running this module as a script prints the list returned by gen_ops_library().
+if __name__ == "__main__":
+    print(gen_ops_library())
--- a/python/ck4inductor/batched_universal_gemm/op.py
+++ b/python/ck4inductor/batched_universal_gemm/op.py
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+from dataclasses import asdict, dataclass
+from typing import Optional, Tuple
+
+
+@dataclass
+class CKBatchedGemmOperation:
+    """
+    A python dataclass storing the template parameters of a CK batched Universal Gemm
+    template instance.
+
+    Field order is significant: instances may be built positionally from a parsed
+    template argument list, so fields must stay in template-argument order.
+    """
+
+    # tensor layouts
+    a_layout: str
+    b_layout: str
+    ds_layouts: Tuple[str, ...]  # addmm specific; may be empty
+    c_layout: str
+
+    # element data types
+    a_element_dtype: str
+    b_element_dtype: str
+    ds_element_dtypes: Tuple[str, ...]  # addmm specific; may be empty
+    c_element_dtype: str
+
+    acc_dtype: str
+    c_shuffle_dtype: str
+
+    # elementwise operations
+    a_elementwise_op: str
+    b_elementwise_op: str
+    c_elementwise_op: str
+
+    gemm_specialization: str
+
+    block_size: int
+
+    m_per_block: int
+    n_per_block: int
+    k_per_block: int
+
+    a_k1: int
+    b_k1: int
+
+    m_per_xdl: int
+    n_per_xdl: int
+
+    m_xdl_per_wave: int
+    n_xdl_per_wave: int
+
+    # A block transfer parameters
+    a_block_transfer_thread_cluster_lengths_ak0_m_ak1: Tuple[int, int, int]
+    a_block_transfer_thread_cluster_arrange_order: Tuple[int, int, int]
+    a_block_transfer_src_access_order: Tuple[int, int, int]
+    a_block_transfer_src_vector_dim: int
+    a_block_transfer_src_scalar_per_vector: int
+    a_block_transfer_dst_scalar_per_vector_ak1: int
+    a_block_lds_extra_m: bool
+
+    # B block transfer parameters
+    b_block_transfer_thread_cluster_lengths_bk0_n_bk1: Tuple[int, int, int]
+    b_block_transfer_thread_cluster_arrange_order: Tuple[int, int, int]
+    b_block_transfer_src_access_order: Tuple[int, int, int]
+
+    b_block_transfer_src_vector_dim: int
+    b_block_transfer_src_scalar_per_vector: int
+    b_block_transfer_dst_scalar_per_vector_bk1: int
+    b_block_lds_extra_n: bool
+
+    # C shuffle parameters
+    c_shuffle_m_xdl_per_wave_per_shuffle: int
+    c_shuffle_n_xdl_per_wave_per_shuffle: int
+
+    c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block: (
+        Tuple[int, int, int, int]
+    )
+    c_shuffle_block_transfer_scalar_per_vector_n_per_block: Tuple[int]
+    block_gemm_pipeline_scheduler: str
+    block_gemm_pipeline_version: str
+
+    # optional trailing template arguments
+    a_compute_dtype: Optional[str] = None
+    b_compute_dtype: Optional[str] = None
+
+    def name(self):
+        """Return the cpp alias for the template instance (fixed prefix + `key_name()`)."""
+        return f"ck_device_batched_gemm_multi_d_xdl_c_shuffle_v3_{self.key_name()}"
+
+    def key_name(self):
+        """
+        Build a string key from every field, intended for use as a dict key.
+
+        Each field contributes `K<fieldname>V<value>`: underscores are removed
+        from the field name, tuple values are joined with `x`, and colons are
+        removed from the string form of other values. Fields are joined with `_`.
+        """
+        # TBD; must be unique per instance. Intended to use as dict key
+        parts = []
+        for field_name, field_value in self.dict_items():
+            if isinstance(field_value, tuple):
+                value_text = "x".join(map(str, field_value))
+            else:
+                value_text = str(field_value).replace(":", "")
+            parts.append("K" + field_name.replace("_", "").lower() + "V" + value_text)
+        return "_".join(parts)
+
+    def dict_items(self):
+        """Return the (field name, value) pairs of this instance in declaration order."""
+        return asdict(self).items()
--- a/python/ck4inductor/grouped_conv_fwd/gen_instances.py
+++ b/python/ck4inductor/grouped_conv_fwd/gen_instances.py
@@ -130,9 +130,7 @@ def gen_conv_ops_library() -> List[CKGroupedConvFwdOp]:
    # substitute templated args by looping through their domains
    substitute_instances = []
    for instance in op_instances:
-        sub_scheduler = (
-            instance.block_gemm_pipeline_scheduler == "BlkGemmPipeSched"
-        )
+        sub_scheduler = instance.block_gemm_pipeline_scheduler == "BlkGemmPipeSched"
        sub_spec = instance.conv_forward_specialization == "ConvSpec"
        schedulers_range = (
            schedulers if sub_scheduler else [instance.block_gemm_pipeline_scheduler]

--- a/test/ck_tile/CMakeLists.txt
+++ b/test/ck_tile/CMakeLists.txt
 add_subdirectory(image_to_column)
 add_subdirectory(gemm)
+add_subdirectory(batched_gemm)
--- a/test/ck_tile/batched_gemm/CMakeLists.txt
+++ b/test/ck_tile/batched_gemm/CMakeLists.txt
+# Currently ck_tile is only built on gfx9
+if(GPU_TARGETS MATCHES "gfx9")
+    add_gtest_executable(test_ck_tile_batched_gemm test_batched_gemm.cpp)
+endif()
--- a/test/ck_tile/batched_gemm/test_batched_gemm.cpp
+++ b/test/ck_tile/batched_gemm/test_batched_gemm.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <tuple>
+
+#include "gtest/gtest.h"
+
+#include "ck_tile/host.hpp"
+#include "test_batched_gemm_util.hpp"
+
+using F16 = ck_tile::half_t;
+using F32 = float;
+
+using Row = ck_tile::tensor_layout::gemm::RowMajor;
+using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
+
+// clang-format off
+// Type list used to instantiate the TestCkTileBatchedGemm typed test suite; each tuple is one
+// (layouts, data types) configuration. Only the two tuples with row-major A are active; the
+// two column-major-A tuples are left commented out.
+using KernelTypes = ::testing::Types<
+    //         ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType, CDataType
+    std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,      F16>,
+    //std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,      F16>,
+    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,      F16>//,
+    //std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,      F16>
+    >;
+// clang-format on
+
+TYPED_TEST_SUITE(TestCkTileBatchedGemm, KernelTypes);
+
+#include "test_batched_gemm_ut_cases.inc"
--- a/test/ck_tile/batched_gemm/test_batched_gemm_ut_cases.inc
+++ b/test/ck_tile/batched_gemm/test_batched_gemm_ut_cases.inc
+#pragma once
+
+// Basic case: forwards one fixed problem size (M=256, N=128, K=128) to the fixture's Run().
+// NOTE(review): Run() is not defined in this file — presumably it comes from the fixture in
+// test_batched_gemm_util.hpp and performs the launch and result check; confirm there.
+TYPED_TEST(TestCkTileBatchedGemm, Basic)
+{
+    constexpr int M = 256;
+    constexpr int N = 128;
+    constexpr int K = 128;
+    this->Run(M, N, K);
+}