Merge remote-tracking branch 'origin/develop' into migx-jit-lib

ac580f77 · Alan Turner · 707d6261 · 027e46ee · ac580f77 · ac580f77
Commit ac580f77 authored Jun 16, 2023 by Alan Turner
20 changed files
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_int8_instance.cpp
@@ -23,7 +23,7 @@ template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
 using NDHWGC = ck::tensor_layout::convolution::NDHWGC;
-using KZYXGC = ck::tensor_layout::convolution::KZYXGC;
+using GKZYXC = ck::tensor_layout::convolution::GKZYXC;
 using NDHWGK = ck::tensor_layout::convolution::NDHWGK;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
@@ -40,71 +40,71 @@ static constexpr auto ConvFwd1x1S1P0 =
 static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
 // in[g, n, di, hi, wi, c] * wei[g, k, z, y, x, c] = out[g, n, do, ho, wo, k]
-using device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_int8_instances = std::tuple<
+using device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_int8_instances = std::tuple<
    // clang-format off
        // Default
        //########################################|  NumDim|      A|      B|          Ds|      E|  AData|  BData| AccData| CShuffle|          Ds|  EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
        //########################################| Spatial| Layout| Layout|      Layout| Layout|   Type|   Type|    Type| DataType|    DataType|   Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
        //########################################|        |       |       |            |       |       |       |        |         |            |       |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
        //########################################|        |       |       |            |       |       |       |        |         |            |       |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
        // Filter1x1Pad0
        //########################################|  NumDim|      A|      B|          Ds|      E|  AData|  BData| AccData| CShuffle|          Ds|  EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
        //########################################| Spatial| Layout| Layout|      Layout| Layout|   Type|   Type|    Type| DataType|    DataType|   Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
        //########################################|        |       |       |            |       |       |       |        |         |            |       |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
        //########################################|        |       |       |            |       |       |       |        |         |            |       |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
        // Filter1x1Stride1Pad0                                                                                                                                                                                 
        //########################################|  NumDim|      A|      B|          Ds|      E|  AData|  BData| AccData| CShuffle|          Ds|  EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
        //########################################| Spatial| Layout| Layout|      Layout| Layout|   Type|   Type|    Type| DataType|    DataType|   Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
        //########################################|        |       |       |            |       |       |       |        |         |            |       |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
        //########################################|        |       |       |            |       |       |       |        |         |            |       |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, KZYXGC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>
+        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK, int8_t, int8_t, int32_t,   int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>
    // clang-format on
    >;
-void add_device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_int8_instances(
+void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_int8_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
                                                              NDHWGC,
-                                                              KZYXGC,
+                                                              GKZYXC,
                                                              Empty_Tuple,
                                                              NDHWGK,
                                                              int8_t,
@@ -116,7 +116,7 @@ void add_device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_int8_instances(
                                                              PassThrough>>>& instances)
 {
    add_device_operation_instances(
-        instances, device_grouped_conv3d_fwd_xdl_ndhwgc_kzyxgc_ndhwgk_int8_instances{});
+        instances, device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_int8_instances{});
 }
 } // namespace instance

--- a/profiler/README.md
+++ b/profiler/README.md
@@ -76,3 +76,30 @@ e_m_n: dim 4, lengths {128, 128, 128, 128}, strides {2097152, 16384, 128, 1}
 ....
 Best Perf: 211.405 ms, 41.6077 TFlops, 15.2372 GB/s
 ```
+## Profile batched gemm multiple D kernels
+```bash
+#arg1: tensor operation (batched_gemm_multi_d=Batched GEMM multi D);
+#arg2: data type (0: fp16; 1: int8)
+#arg3: matrix layout (0: A[g, m, k] * B[g, k, n] = C[g, m, n];
+#                     1: A[g, m, k] * B[g, n, k] = C[g, m, n];
+#                     2: A[g, k, m] * B[g, k, n] = C[g, m, n];
+#                     3: A[g, k, m] * B[g, n, k] = C[g, m, n])
+#arg4: verification (0: no; 1: yes)
+#arg5: initialization (0: no init; 1: integer value; 2: decimal value)
+#arg6: print tensor value (0: no; 1: yes)
+#arg7: time kernel (0=n0, 1=yes)
+#arg8 to 17: M, N, K, StrideA, StrideB, StrideC, BatchStrideA, BatchStrideB, BatchStrideC, BatchCount
+################                   op  datatype  layout  verify  init  log  time    M    N    K StrideA StrideB StrideC BatchStrideA BatchStrideB BatchStrideC BatchCount
+./bin/ckProfiler batched_gemm_multi_d         0       1       0     0    0     1 4096 4096 4096    4096    4096    4096     16777216     16777216     16777216         16
+```
+Result (Radeon RX 6800 XT)
+```bash
+arg.a_grid_desc_k0_m0_m1_k1_{2048, 4096, 2}
+arg.b_grid_desc_k0_n0_n1_k1_{2048, 4096, 2}
+arg.e_grid_desc_m_n_{ 4096, 4096}
+....
+Best Perf: 58.0306 ms, 37.8942 TFlops, 27.7545 GB/s
+```
--- a/profiler/include/profiler/profile_batched_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_impl.hpp
@@ -8,9 +8,11 @@
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_batched_gemm.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
+#include "ck/library/tensor_operation_instance/gpu/batched_gemm_multi_d.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
@@ -27,7 +29,11 @@ template <typename ADataType,
          typename CDataType,
          typename ALayout,
          typename BLayout,
-          typename CLayout>
+          typename CLayout,
+          typename AElementOp,
+          typename BElementOp,
+          typename CElementOp,
+          typename DeviceOp>
 bool profile_batched_gemm_impl(int do_verification,
                               int init_method,
                               bool do_log,
@@ -88,10 +94,6 @@ bool profile_batched_gemm_impl(int do_verification,
        b_g_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
    }
-    using AElementOp = ck::tensor_operation::element_wise::PassThrough;
-    using BElementOp = ck::tensor_operation::element_wise::PassThrough;
-    using CElementOp = ck::tensor_operation::element_wise::PassThrough;
    const auto a_element_op = AElementOp{};
    const auto b_element_op = BElementOp{};
    const auto c_element_op = CElementOp{};
@@ -124,16 +126,6 @@ bool profile_batched_gemm_impl(int do_verification,
    b_device_buf.ToDevice(b_g_k_n.mData.data());
    c_device_buf.ToDevice(c_g_m_n_device_result.mData.data());
-    using DeviceOp = ck::tensor_operation::device::DeviceBatchedGemm<ALayout,
-                                                                     BLayout,
-                                                                     CLayout,
-                                                                     ADataType,
-                                                                     BDataType,
-                                                                     CDataType,
-                                                                     AElementOp,
-                                                                     BElementOp,
-                                                                     CElementOp>;
    // get device op instances
    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
        DeviceOp>::GetInstances();
@@ -148,23 +140,62 @@ bool profile_batched_gemm_impl(int do_verification,
    // profile device op instances
    for(auto& op_ptr : op_ptrs)
    {
-        auto argument_ptr =
+        std::unique_ptr<tensor_operation::device::BaseArgument> argument_ptr;
-            op_ptr->MakeArgumentPointer(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
+        // false branch for multi d dl kernel
-                                        static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
+        if constexpr(std::is_same<
-                                        static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
+                         DeviceOp,
-                                        M,
+                         ck::tensor_operation::device::DeviceBatchedGemm<ALayout,
-                                        N,
+                                                                         BLayout,
-                                        K,
+                                                                         CLayout,
-                                        StrideA,
+                                                                         ADataType,
-                                        StrideB,
+                                                                         BDataType,
-                                        StrideC,
+                                                                         CDataType,
-                                        BatchStrideA,
+                                                                         AElementOp,
-                                        BatchStrideB,
+                                                                         BElementOp,
-                                        BatchStrideC,
+                                                                         CElementOp>>::value)
-                                        BatchCount,
+        {
-                                        ck::tensor_operation::element_wise::PassThrough{},
-                                        ck::tensor_operation::element_wise::PassThrough{},
+            argument_ptr =
-                                        ck::tensor_operation::element_wise::PassThrough{});
+                op_ptr->MakeArgumentPointer(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
+                                            static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
+                                            static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
+                                            M,
+                                            N,
+                                            K,
+                                            StrideA,
+                                            StrideB,
+                                            StrideC,
+                                            BatchStrideA,
+                                            BatchStrideB,
+                                            BatchStrideC,
+                                            BatchCount,
+                                            ck::tensor_operation::element_wise::PassThrough{},
+                                            ck::tensor_operation::element_wise::PassThrough{},
+                                            ck::tensor_operation::element_wise::PassThrough{});
+        }
+        else
+        {
+            argument_ptr =
+                op_ptr->MakeArgumentPointer(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
+                                            static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
+                                            {},
+                                            static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
+                                            M,
+                                            N,
+                                            K,
+                                            BatchCount,
+                                            StrideA,
+                                            StrideB,
+                                            {},
+                                            StrideC,
+                                            BatchStrideA,
+                                            BatchStrideB,
+                                            {},
+                                            BatchStrideC,
+                                            ck::tensor_operation::element_wise::PassThrough{},
+                                            ck::tensor_operation::element_wise::PassThrough{},
+                                            ck::tensor_operation::element_wise::PassThrough{});
+        }
        auto invoker_ptr = op_ptr->MakeInvokerPointer();

--- a/profiler/src/CMakeLists.txt
+++ b/profiler/src/CMakeLists.txt
@@ -34,6 +34,7 @@ set(PROFILER_SOURCES
    profile_grouped_gemm_fastgelu.cpp
    profile_contraction_bilinear.cpp
    profile_contraction_scale.cpp
+    profile_batched_gemm_multi_d.cpp
 )
 set(PROFILER_EXECUTABLE ckProfiler)
@@ -77,5 +78,5 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fastgel
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_bilinear_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_scale_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool_fwd_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_multi_d_instance)
 rocm_install(TARGETS ${PROFILER_EXECUTABLE} COMPONENT profiler)
--- a/profiler/src/profile_batched_gemm.cpp
+++ b/profiler/src/profile_batched_gemm.cpp
@@ -10,6 +10,8 @@
 #include "profiler/profile_batched_gemm_impl.hpp"
 #include "profiler_operation_registry.hpp"
+#include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
 enum struct GemmMatrixLayout
 {
    MK_KN_MN, // 0
@@ -78,55 +80,72 @@ int profile_batched_gemm(int argc, char* argv[])
    using Row = ck::tensor_layout::gemm::RowMajor;
    using Col = ck::tensor_layout::gemm::ColumnMajor;
-    auto profile = [&](auto a_type,
+    auto profile =
-                       auto b_type,
+        [&](auto a_type, auto b_type, auto c_type, auto a_layout, auto b_layout, auto c_layout) {
-                       auto c_type,
+            using ADataType = decltype(a_type);
-                       auto a_layout,
+            using BDataType = decltype(b_type);
-                       auto b_layout,
+            using CDataType = decltype(c_type);
-                       auto c_layout) {
-        using ADataType = decltype(a_type);
+            using ALayout = decltype(a_layout);
-        using BDataType = decltype(b_type);
+            using BLayout = decltype(b_layout);
-        using CDataType = decltype(c_type);
+            using CLayout = decltype(c_layout);
-        using ALayout = decltype(a_layout);
+            const int DefaultStrideA = ck::is_same_v<ALayout, Row> ? K : M;
-        using BLayout = decltype(b_layout);
+            const int DefaultStrideB = ck::is_same_v<BLayout, Row> ? N : K;
-        using CLayout = decltype(c_layout);
+            const int DefaultStrideC = ck::is_same_v<CLayout, Row> ? N : M;
-        const int DefaultStrideA = ck::is_same_v<ALayout, Row> ? K : M;
+            const int StrideA_ = (StrideA < 0) ? DefaultStrideA : StrideA;
-        const int DefaultStrideB = ck::is_same_v<BLayout, Row> ? N : K;
+            const int StrideB_ = (StrideB < 0) ? DefaultStrideB : StrideB;
-        const int DefaultStrideC = ck::is_same_v<CLayout, Row> ? N : M;
+            const int StrideC_ = (StrideC < 0) ? DefaultStrideC : StrideC;
-        const int StrideA_ = (StrideA < 0) ? DefaultStrideA : StrideA;
+            const int DefaultBatchStrideA = (ck::is_same_v<ALayout, Row> ? M : K) * StrideA_;
-        const int StrideB_ = (StrideB < 0) ? DefaultStrideB : StrideB;
+            const int DefaultBatchStrideB = (ck::is_same_v<BLayout, Row> ? K : N) * StrideB_;
-        const int StrideC_ = (StrideC < 0) ? DefaultStrideC : StrideC;
+            const int DefaultBatchStrideC = (ck::is_same_v<CLayout, Row> ? M : N) * StrideC_;
-        const int DefaultBatchStrideA = (ck::is_same_v<ALayout, Row> ? M : K) * StrideA_;
+            const int BatchStrideA_ = (BatchStrideA < 0) ? DefaultBatchStrideA : BatchStrideA;
-        const int DefaultBatchStrideB = (ck::is_same_v<BLayout, Row> ? K : N) * StrideB_;
+            const int BatchStrideB_ = (BatchStrideB < 0) ? DefaultBatchStrideB : BatchStrideB;
-        const int DefaultBatchStrideC = (ck::is_same_v<CLayout, Row> ? M : N) * StrideC_;
+            const int BatchStrideC_ = (BatchStrideC < 0) ? DefaultBatchStrideC : BatchStrideC;
-        const int BatchStrideA_ = (BatchStrideA < 0) ? DefaultBatchStrideA : BatchStrideA;
+            using AElementOp = ck::tensor_operation::element_wise::PassThrough;
-        const int BatchStrideB_ = (BatchStrideB < 0) ? DefaultBatchStrideB : BatchStrideB;
+            using BElementOp = ck::tensor_operation::element_wise::PassThrough;
-        const int BatchStrideC_ = (BatchStrideC < 0) ? DefaultBatchStrideC : BatchStrideC;
+            using CElementOp = ck::tensor_operation::element_wise::PassThrough;
-        bool pass = ck::profiler::
+            using DeviceOp = ck::tensor_operation::device::DeviceBatchedGemm<ALayout,
-            profile_batched_gemm_impl<ADataType, BDataType, CDataType, ALayout, BLayout, CLayout>(
+                                                                             BLayout,
-                do_verification,
+                                                                             CLayout,
-                init_method,
+                                                                             ADataType,
-                do_log,
+                                                                             BDataType,
-                time_kernel,
+                                                                             CDataType,
-                M,
+                                                                             AElementOp,
-                N,
+                                                                             BElementOp,
-                K,
+                                                                             CElementOp>;
-                BatchStrideA_,
-                BatchStrideB_,
+            bool pass = ck::profiler::profile_batched_gemm_impl<ADataType,
-                BatchStrideC_,
+                                                                BDataType,
-                StrideA_,
+                                                                CDataType,
-                StrideB_,
+                                                                ALayout,
-                StrideC_,
+                                                                BLayout,
-                BatchCount);
+                                                                CLayout,
+                                                                AElementOp,
-        return pass ? 0 : 1;
+                                                                BElementOp,
-    };
+                                                                CElementOp,
+                                                                DeviceOp>(do_verification,
+                                                                          init_method,
+                                                                          do_log,
+                                                                          time_kernel,
+                                                                          M,
+                                                                          N,
+                                                                          K,
+                                                                          BatchStrideA_,
+                                                                          BatchStrideB_,
+                                                                          BatchStrideC_,
+                                                                          StrideA_,
+                                                                          StrideB_,
+                                                                          StrideC_,
+                                                                          BatchCount);
+            return pass ? 0 : 1;
+        };
    if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN)
    {

--- a/profiler/src/profile_batched_gemm_multi_d.cpp
+++ b/profiler/src/profile_batched_gemm_multi_d.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#include <cstdint>
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include "profiler/profile_batched_gemm_impl.hpp"
+#include "profiler_operation_registry.hpp"
+#include "ck/library/tensor_operation_instance/gpu/batched_gemm_multi_d.hpp"
+enum struct GemmMatrixLayout
+{
+    MK_KN_MN, // 0
+    MK_NK_MN, // 1
+    KM_KN_MN, // 2
+    KM_NK_MN, // 3
+};
+enum struct GemmDataType
+{
+    F16_F16_F16,    // 0
+    INT8_INT8_INT8, // 1
+};
+#define OP_NAME "batched_gemm_multi_d"
+#define OP_DESC "Batched GEMM multi D"
+int profile_batched_gemm_multi_d(int argc, char* argv[])
+{
+    if(argc != 18)
+    {
+        // clang-format off
+        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
+        printf("arg2: data type (0: fp16; 1: int8)\n");
+        printf("arg3: matrix layout (0: A[g, m, k] * B[g, k, n] = C[g, m, n];\n");
+        printf("                     1: A[g, m, k] * B[g, n, k] = C[g, m, n];\n");
+        printf("                     2: A[g, k, m] * B[g, k, n] = C[g, m, n];\n");
+        printf("                     3: A[g, k, m] * B[g, n, k] = C[g, m, n])\n");
+        printf("arg4: verification (0: no; 1: yes)\n");
+        printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
+        printf("arg6: print tensor value (0: no; 1: yes)\n");
+        printf("arg7: time kernel (0=n0, 1=yes)\n");
+        printf("arg8 to 17: M, N, K, StrideA, StrideB, StrideC, BatchStrideA, BatchStrideB, BatchStrideC, BatchCount\n");
+        // clang-format on
+        exit(1);
+    }
+    const auto data_type       = static_cast<GemmDataType>(std::stoi(argv[2]));
+    const auto layout          = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
+    const bool do_verification = std::stoi(argv[4]);
+    const int init_method      = std::stoi(argv[5]);
+    const bool do_log          = std::stoi(argv[6]);
+    const bool time_kernel     = std::stoi(argv[7]);
+    const int M = std::stoi(argv[8]);
+    const int N = std::stoi(argv[9]);
+    const int K = std::stoi(argv[10]);
+    const int StrideA = std::stoi(argv[11]);
+    const int StrideB = std::stoi(argv[12]);
+    const int StrideC = std::stoi(argv[13]);
+    const int BatchStrideA = std::stoi(argv[14]);
+    const int BatchStrideB = std::stoi(argv[15]);
+    const int BatchStrideC = std::stoi(argv[16]);
+    const int BatchCount = std::stoi(argv[17]);
+    using F16  = ck::half_t;
+    using INT8 = int8_t;
+    using Row = ck::tensor_layout::gemm::RowMajor;
+    using Col = ck::tensor_layout::gemm::ColumnMajor;
+    auto profile =
+        [&](auto a_type, auto b_type, auto c_type, auto a_layout, auto b_layout, auto c_layout) {
+            using ADataType  = decltype(a_type);
+            using BDataType  = decltype(b_type);
+            using CDataType  = decltype(c_type);
+            using DsDataType = ck::Tuple<>;
+            using ALayout  = decltype(a_layout);
+            using BLayout  = decltype(b_layout);
+            using CLayout  = decltype(c_layout);
+            using DsLayout = ck::Tuple<>;
+            const int DefaultStrideA = ck::is_same_v<ALayout, Row> ? K : M;
+            const int DefaultStrideB = ck::is_same_v<BLayout, Row> ? N : K;
+            const int DefaultStrideC = ck::is_same_v<CLayout, Row> ? N : M;
+            const int StrideA_ = (StrideA < 0) ? DefaultStrideA : StrideA;
+            const int StrideB_ = (StrideB < 0) ? DefaultStrideB : StrideB;
+            const int StrideC_ = (StrideC < 0) ? DefaultStrideC : StrideC;
+            const int DefaultBatchStrideA = (ck::is_same_v<ALayout, Row> ? M : K) * StrideA_;
+            const int DefaultBatchStrideB = (ck::is_same_v<BLayout, Row> ? K : N) * StrideB_;
+            const int DefaultBatchStrideC = (ck::is_same_v<CLayout, Row> ? M : N) * StrideC_;
+            const int BatchStrideA_ = (BatchStrideA < 0) ? DefaultBatchStrideA : BatchStrideA;
+            const int BatchStrideB_ = (BatchStrideB < 0) ? DefaultBatchStrideB : BatchStrideB;
+            const int BatchStrideC_ = (BatchStrideC < 0) ? DefaultBatchStrideC : BatchStrideC;
+            using AElementOp = ck::tensor_operation::element_wise::PassThrough;
+            using BElementOp = ck::tensor_operation::element_wise::PassThrough;
+            using CElementOp = ck::tensor_operation::element_wise::PassThrough;
+            using DeviceOp = ck::tensor_operation::device::DeviceBatchedGemmMultiD<ALayout,
+                                                                                   BLayout,
+                                                                                   DsLayout,
+                                                                                   CLayout,
+                                                                                   ADataType,
+                                                                                   BDataType,
+                                                                                   DsDataType,
+                                                                                   CDataType,
+                                                                                   AElementOp,
+                                                                                   BElementOp,
+                                                                                   CElementOp>;
+            bool pass = ck::profiler::profile_batched_gemm_impl<ADataType,
+                                                                BDataType,
+                                                                CDataType,
+                                                                ALayout,
+                                                                BLayout,
+                                                                CLayout,
+                                                                AElementOp,
+                                                                BElementOp,
+                                                                CElementOp,
+                                                                DeviceOp>(do_verification,
+                                                                          init_method,
+                                                                          do_log,
+                                                                          time_kernel,
+                                                                          M,
+                                                                          N,
+                                                                          K,
+                                                                          BatchStrideA_,
+                                                                          BatchStrideB_,
+                                                                          BatchStrideC_,
+                                                                          StrideA_,
+                                                                          StrideB_,
+                                                                          StrideC_,
+                                                                          BatchCount);
+            return pass ? 0 : 1;
+        };
+    if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
+    {
+        return profile(F16{}, F16{}, F16{}, Row{}, Row{}, Row{});
+    }
+    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
+    {
+        return profile(F16{}, F16{}, F16{}, Row{}, Col{}, Row{});
+    }
+    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN)
+    {
+        return profile(F16{}, F16{}, F16{}, Col{}, Row{}, Row{});
+    }
+    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN)
+    {
+        return profile(F16{}, F16{}, F16{}, Col{}, Col{}, Row{});
+    }
+    else if(data_type == GemmDataType::INT8_INT8_INT8 && layout == GemmMatrixLayout::MK_KN_MN)
+    {
+        return profile(INT8{}, INT8{}, INT8{}, Row{}, Row{}, Row{});
+    }
+    else if(data_type == GemmDataType::INT8_INT8_INT8 && layout == GemmMatrixLayout::MK_NK_MN)
+    {
+        return profile(INT8{}, INT8{}, INT8{}, Row{}, Col{}, Row{});
+    }
+    else if(data_type == GemmDataType::INT8_INT8_INT8 && layout == GemmMatrixLayout::KM_KN_MN)
+    {
+        return profile(INT8{}, INT8{}, INT8{}, Col{}, Row{}, Row{});
+    }
+    else if(data_type == GemmDataType::INT8_INT8_INT8 && layout == GemmMatrixLayout::KM_NK_MN)
+    {
+        return profile(INT8{}, INT8{}, INT8{}, Col{}, Col{}, Row{});
+    }
+    else
+    {
+        std::cout << "this data_type & layout is not implemented" << std::endl;
+        return 1;
+    }
+}
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_batched_gemm_multi_d);
--- a/script/cmake-ck-dev.sh
+++ b/script/cmake-ck-dev.sh
@@ -12,7 +12,7 @@ cmake
 -save-temps=$PWD"                                                                                 \
 -D CMAKE_BUILD_TYPE=Release                                                                       \
 -D BUILD_DEV=ON                                                                                   \
-D GPU_TARGETS="gfx908;gfx90a;gfx940"                                                             \
+-D GPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942"                                               \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                                 \
 -D USE_BITINT_EXTENSION_INT4=OFF                                                                  \
 ${MY_PROJECT_SOURCE}

--- a/script/cmake-ck-release.sh
+++ b/script/cmake-ck-release.sh
@@ -11,7 +11,7 @@ cmake
 -D CMAKE_CXX_FLAGS="-O3"                                                                          \
 -D CMAKE_BUILD_TYPE=Release                                                                       \
 -D BUILD_DEV=OFF                                                                                  \
-D GPU_TARGETS="gfx908;gfx90a;gfx940"                                                             \
+-D GPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942"                                               \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                                 \
 -D USE_BITINT_EXTENSION_INT4=OFF                                                                  \
 ${MY_PROJECT_SOURCE}

--- a/test/batched_gemm/CMakeLists.txt
+++ b/test/batched_gemm/CMakeLists.txt
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
   add_test_executable(test_batched_gemm_fp16 batched_gemm_fp16.cpp)
   target_link_libraries(test_batched_gemm_fp16 PRIVATE utility)
   target_link_libraries(test_batched_gemm_fp16 PRIVATE device_batched_gemm_instance)
@@ -14,4 +17,6 @@ if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS M
   add_test_executable(test_batched_gemm_int8 batched_gemm_int8.cpp)
   target_link_libraries(test_batched_gemm_int8 PRIVATE utility)
   target_link_libraries(test_batched_gemm_int8 PRIVATE device_batched_gemm_instance)
-endif()
+   set(target 1)
\ No newline at end of file
+ endif()
+endforeach()
\ No newline at end of file
--- a/test/batched_gemm/batched_gemm_bf16.cpp
+++ b/test/batched_gemm/batched_gemm_bf16.cpp
@@ -5,6 +5,8 @@
 #include "profiler/profile_batched_gemm_impl.hpp"
+#include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
 namespace {
 using ADataType = ck::bhalf_t;
 using BDataType = ck::bhalf_t;
@@ -12,6 +14,8 @@ using CDataType = ck::bhalf_t;
 using Row = ck::tensor_layout::gemm::RowMajor;
 using Col = ck::tensor_layout::gemm::ColumnMajor;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 } // namespace
 int main()
@@ -23,21 +27,87 @@ int main()
    bool pass = true;
-    pass = pass &&
+    using namespace ck::tensor_operation::device;
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Row, Row>(
-               true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
+                                                           BDataType,
+                                                           CDataType,
+                                                           Row,
+                                                           Row,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Row,
+                                                                             Row,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
-    pass = pass &&
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Col, Row>(
+                                                           BDataType,
-               true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
+                                                           CDataType,
+                                                           Row,
+                                                           Col,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Row,
+                                                                             Col,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
-    pass = pass &&
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Row, Row>(
+                                                           BDataType,
-               true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
+                                                           CDataType,
+                                                           Col,
+                                                           Row,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Col,
+                                                                             Row,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
-    pass = pass &&
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Col, Row>(
+                                                           BDataType,
-               true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
+                                                           CDataType,
+                                                           Col,
+                                                           Col,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Col,
+                                                                             Col,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
    std::cout << "test BatchedGEMM bf16: " << (pass ? "Pass" : "Fail") << std::endl;
    return pass ? 0 : 1;

--- a/test/batched_gemm/batched_gemm_fp16.cpp
+++ b/test/batched_gemm/batched_gemm_fp16.cpp
@@ -5,6 +5,8 @@
 #include "profiler/profile_batched_gemm_impl.hpp"
+#include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
 namespace {
 using ADataType = ck::half_t;
 using BDataType = ck::half_t;
@@ -12,6 +14,8 @@ using CDataType = ck::half_t;
 using Row = ck::tensor_layout::gemm::RowMajor;
 using Col = ck::tensor_layout::gemm::ColumnMajor;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 } // namespace
 int main()
@@ -23,21 +27,87 @@ int main()
    bool pass = true;
-    pass = pass &&
+    using namespace ck::tensor_operation::device;
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Row, Row>(
-               true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
+                                                           BDataType,
+                                                           CDataType,
+                                                           Row,
+                                                           Row,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Row,
+                                                                             Row,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
-    pass = pass &&
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Col, Row>(
+                                                           BDataType,
-               true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
+                                                           CDataType,
+                                                           Row,
+                                                           Col,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Row,
+                                                                             Col,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
-    pass = pass &&
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Row, Row>(
+                                                           BDataType,
-               true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
+                                                           CDataType,
+                                                           Col,
+                                                           Row,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Col,
+                                                                             Row,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
-    pass = pass &&
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Col, Row>(
+                                                           BDataType,
-               true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
+                                                           CDataType,
+                                                           Col,
+                                                           Col,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Col,
+                                                                             Col,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
    std::cout << "test BatchedGEMM fp16: " << (pass ? "Pass" : "Fail") << std::endl;
    return pass ? 0 : 1;

--- a/test/batched_gemm/batched_gemm_fp32.cpp
+++ b/test/batched_gemm/batched_gemm_fp32.cpp
@@ -5,6 +5,8 @@
 #include "profiler/profile_batched_gemm_impl.hpp"
+#include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
 namespace {
 using ADataType = float;
 using BDataType = float;
@@ -12,6 +14,8 @@ using CDataType = float;
 using Row = ck::tensor_layout::gemm::RowMajor;
 using Col = ck::tensor_layout::gemm::ColumnMajor;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 } // namespace
 int main()
@@ -23,21 +27,87 @@ int main()
    bool pass = true;
-    pass = pass &&
+    using namespace ck::tensor_operation::device;
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Row, Row>(
-               true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
+                                                           BDataType,
+                                                           CDataType,
+                                                           Row,
+                                                           Row,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Row,
+                                                                             Row,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
-    pass = pass &&
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Col, Row>(
+                                                           BDataType,
-               true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
+                                                           CDataType,
+                                                           Row,
+                                                           Col,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Row,
+                                                                             Col,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
-    pass = pass &&
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Row, Row>(
+                                                           BDataType,
-               true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
+                                                           CDataType,
+                                                           Col,
+                                                           Row,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Col,
+                                                                             Row,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
-    pass = pass &&
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Col, Row>(
+                                                           BDataType,
-               true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
+                                                           CDataType,
+                                                           Col,
+                                                           Col,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Col,
+                                                                             Col,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
    std::cout << "test BatchedGEMM fp32: " << (pass ? "Pass" : "Fail") << std::endl;
    return pass ? 0 : 1;

--- a/test/batched_gemm/batched_gemm_int8.cpp
+++ b/test/batched_gemm/batched_gemm_int8.cpp
@@ -5,6 +5,8 @@
 #include "profiler/profile_batched_gemm_impl.hpp"
+#include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
 namespace {
 using ADataType = int8_t;
 using BDataType = int8_t;
@@ -12,6 +14,8 @@ using CDataType = int8_t;
 using Row = ck::tensor_layout::gemm::RowMajor;
 using Col = ck::tensor_layout::gemm::ColumnMajor;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 } // namespace
 int main()
@@ -23,21 +27,87 @@ int main()
    bool pass = true;
-    pass = pass &&
+    using namespace ck::tensor_operation::device;
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Row, Row>(
-               true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
+                                                           BDataType,
+                                                           CDataType,
+                                                           Row,
+                                                           Row,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Row,
+                                                                             Row,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
-    pass = pass &&
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Col, Row>(
+                                                           BDataType,
-               true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
+                                                           CDataType,
+                                                           Row,
+                                                           Col,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Row,
+                                                                             Col,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
-    pass = pass &&
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Row, Row>(
+                                                           BDataType,
-               true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
+                                                           CDataType,
+                                                           Col,
+                                                           Row,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Col,
+                                                                             Row,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
-    pass = pass &&
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Col, Row>(
+                                                           BDataType,
-               true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
+                                                           CDataType,
+                                                           Col,
+                                                           Col,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Col,
+                                                                             Col,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
    std::cout << "test BatchedGEMM int8: " << (pass ? "Pass" : "Fail") << std::endl;
    return pass ? 0 : 1;

--- a/test/batched_gemm_gemm/CMakeLists.txt
+++ b/test/batched_gemm_gemm/CMakeLists.txt
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
   add_custom_target(test_batched_gemm_gemm)
   add_gtest_executable(test_batched_gemm_gemm_fp16 test_batched_gemm_gemm_fp16.cpp)
   target_link_libraries(test_batched_gemm_gemm_fp16 PRIVATE utility device_batched_gemm_gemm_instance)
   add_dependencies(test_batched_gemm_gemm test_batched_gemm_gemm_fp16)
-endif()
+   set(target 1)
\ No newline at end of file
+ endif()
+endforeach()
\ No newline at end of file
--- a/test/batched_gemm_multi_d/CMakeLists.txt
+++ b/test/batched_gemm_multi_d/CMakeLists.txt
+# TODO: Enable for gfx90a after complier fix
+if(NOT GPU_TARGETS MATCHES "gfx90a")
+    add_gtest_executable(test_batched_gemm_multi_d test_batched_gemm_multi_d.cpp)
+    target_link_libraries(test_batched_gemm_multi_d PRIVATE utility device_batched_gemm_multi_d_instance)
+endif()
--- a/test/batched_gemm_multi_d/test_batched_gemm_multi_d.cpp
+++ b/test/batched_gemm_multi_d/test_batched_gemm_multi_d.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <iostream>
+#include <gtest/gtest.h>
+#include "profiler/profile_batched_gemm_impl.hpp"
+#include "ck/library/tensor_operation_instance/gpu/batched_gemm_multi_d.hpp"
+namespace {
+using F16 = ck::half_t;
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+using Empty_Tuple = ck::Tuple<>;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+template <typename Tuple>
+class TestBatchedGemmMultiD : public ::testing::Test
+{
+    protected:
+    using ALayout = std::tuple_element_t<0, Tuple>;
+    using BLayout = std::tuple_element_t<1, Tuple>;
+    using CLayout = std::tuple_element_t<2, Tuple>;
+    static constexpr int M          = 512;
+    static constexpr int N          = 256;
+    static constexpr int K          = 128;
+    static constexpr int BatchCount = 3;
+    template <typename DataType>
+    void Run()
+    {
+        using namespace ck::tensor_operation::device;
+        const bool pass =
+            ck::profiler::profile_batched_gemm_impl<DataType,
+                                                    DataType,
+                                                    DataType,
+                                                    ALayout,
+                                                    BLayout,
+                                                    CLayout,
+                                                    PassThrough,
+                                                    PassThrough,
+                                                    PassThrough,
+                                                    DeviceBatchedGemmMultiD<ALayout,
+                                                                            BLayout,
+                                                                            Empty_Tuple,
+                                                                            CLayout,
+                                                                            DataType,
+                                                                            DataType,
+                                                                            Empty_Tuple,
+                                                                            DataType,
+                                                                            PassThrough,
+                                                                            PassThrough,
+                                                                            PassThrough>>(
+                true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
+        EXPECT_TRUE(pass);
+    }
+};
+using KernelTypes = ::testing::Types<std::tuple<Row, Row, Row>,
+                                     std::tuple<Row, Col, Row>,
+                                     std::tuple<Col, Row, Row>,
+                                     std::tuple<Col, Col, Row>>;
+} // namespace
+TYPED_TEST_SUITE(TestBatchedGemmMultiD, KernelTypes);
+TYPED_TEST(TestBatchedGemmMultiD, f16) { this->template Run<F16>(); }
+TYPED_TEST(TestBatchedGemmMultiD, int8) { this->template Run<int8_t>(); }
--- a/test/batched_gemm_reduce/CMakeLists.txt
+++ b/test/batched_gemm_reduce/CMakeLists.txt
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
   add_test_executable(test_batched_gemm_reduce_fp16 batched_gemm_reduce_fp16.cpp)
   target_link_libraries(test_batched_gemm_reduce_fp16 PRIVATE utility)
   target_link_libraries(test_batched_gemm_reduce_fp16 PRIVATE device_batched_gemm_reduce_instance)
-endif()
+   set(target 1)
+ endif()
+endforeach()
--- a/test/batched_gemm_softmax_gemm/CMakeLists.txt
+++ b/test/batched_gemm_softmax_gemm/CMakeLists.txt
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
   add_custom_target(test_batched_gemm_softmax_gemm)
   add_gtest_executable(test_batched_gemm_softmax_gemm_fp16 test_batched_gemm_softmax_gemm_fp16.cpp)
   target_link_libraries(test_batched_gemm_softmax_gemm_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_instance)
   add_dependencies(test_batched_gemm_softmax_gemm test_batched_gemm_softmax_gemm_fp16)
-endif()
+   set(target 1)
\ No newline at end of file
+ endif()
+endforeach()
\ No newline at end of file
--- a/test/batched_gemm_softmax_gemm_permute/CMakeLists.txt
+++ b/test/batched_gemm_softmax_gemm_permute/CMakeLists.txt
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
   add_custom_target(test_batched_gemm_softmax_gemm_permute)
   add_gtest_executable(test_batched_gemm_softmax_gemm_permute_fp16 test_batched_gemm_softmax_gemm_permute_fp16.cpp)
@@ -14,4 +17,6 @@ if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS M
   target_link_libraries(test_batched_gemm_bias_softmax_gemm_permute_bf16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
   add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_bias_softmax_gemm_permute_fp16)
   add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_bias_softmax_gemm_permute_bf16)
-endif()
+   set(target 1)
\ No newline at end of file
+ endif()
+endforeach()
\ No newline at end of file
--- a/test/contraction/CMakeLists.txt
+++ b/test/contraction/CMakeLists.txt
 add_gtest_executable(test_contraction test_contraction.cpp)
 target_link_libraries(test_contraction PRIVATE utility device_contraction_bilinear_instance device_contraction_scale_instance)
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
    add_gtest_executable(test_contraction_interface test_contraction_interface.cpp)
    target_link_libraries(test_contraction_interface PRIVATE utility device_contraction_bilinear_instance device_contraction_scale_instance)
-endif()
+   set(target 1)
+ endif()
+endforeach()