Merge branch 'develop' into grouped_conv_3d_layout_fix

832b69cb · zjing14 · GitHub · 53130727 · a35456a3 · 832b69cb
Unverified Commit 832b69cb authored Jun 14, 2023 by zjing14 Committed by GitHub Jun 14, 2023
20 changed files
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_irregular_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_irregular_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compile-time integer list shorthand: S<a, b, ...> is ck::Sequence<a, b, ...>.
+template <ck::index_t... Values>
+using S = ck::Sequence<Values...>;
+
+// Short names for the row-major / column-major GEMM layout tags.
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+using Row = ck::tensor_layout::gemm::RowMajor;
+
+// Tuple with no elements; fills the Ds layout / Ds data-type slots of the instances.
+using Empty_Tuple = ck::Tuple<>;
+// Element-wise operation used for the A, B and C operation slots of the instances.
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+// GEMM specialization value shared by every instance declared in this file.
+static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
+
+// Compilation parameters for a[k, m] * b[n, k] = c[m, n]
+//
+// Instance list (a std::tuple of DeviceBatchedGemmMultipleD_Dl instantiations) with A and B
+// tagged Col, C tagged Row, int8_t A/B/C data and an int32_t accumulator type. Every entry uses
+// GemmMNPadding, an empty Ds tuple and PassThrough element-wise operations; the entries differ
+// only in their tuning parameters and are grouped by the MPerBlock/NPerBlock tile they use
+// (from 128x128 down to 8x8).
+//
+// In every row below, BlockSize equals the product of the two M11N11Thread cluster sequences,
+// and each transfer's ThreadSliceLengths * ThreadClusterLengths multiplies out to the block tile.
+//
+// NOTE(review): the BBlockTransfer column headers read K0_M0_M1_K1, but in each row the B
+// slice * cluster lengths multiply out to (K0PerBlock, 1, NPerBlock, K1), so they presumably
+// mean K0_N0_N1_K1 -- confirm against the DeviceBatchedGemmMultipleD_Dl parameter list.
+using device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_irregular_instances = std::tuple<
+    // clang-format off
+        // ##########################| ALayout| BLayout|    DsLayout| CLayout|  AData|  BData| AccData|      DsData| CData|           A|           B|           C|           GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|       ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|       BBlockTransfer|     CThreadTransfer|  CThreadTransfer|    CThreadTransfer|
+        // ##########################|        |        |            |        |   Type|   Type|    Type|        Type|  Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
+        // ##########################|        |        |            |        |       |       |        |            |      |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|               Order|                 |                   |
+        // ##########################|        |        |            |        |       |       |        |            |      |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                 |                   |
+        // MPerBlock=128, NPerBlock=128
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   128,   128,   128,    16,  4,          4,          8,      1,       S<8, 2>,       S<4, 2>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   128,   128,   128,    16,  4,          4,          8,      1,       S<4, 4>,       S<4, 2>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   128,   128,   128,    16,  4,          4,          8,      1,       S<2, 8>,       S<2, 4>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=128, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,    64,    16,  4,          4,          2,      1,       S<4, 4>,       S<4, 4>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,    64,    16,  4,          4,          2,      1,       S<2, 8>,       S<2, 8>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=128
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,    64,   128,    16,  4,          2,          4,      1,       S<4, 4>,       S<4, 4>,      S<2, 1, 4, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,    64,   128,    16,  4,          2,          4,      1,       S<2, 8>,       S<2, 8>,      S<2, 1, 4, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<8, 1>,       S<4, 2>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=32, NPerBlock=32
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    32,    32,    32,     8,  4,          2,          4,      1,       S<4, 2>,       S<2, 2>,      S<2, 1, 4, 4>,        S<4, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,      S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    32,    32,    32,     8,  4,          4,          2,      1,       S<2, 2>,       S<4, 2>,      S<2, 1, 4, 4>,        S<4, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,      S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    32,    32,    32,     8,  4,          4,          2,      1,       S<2, 2>,       S<2, 4>,      S<2, 1, 4, 4>,        S<4, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,      S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=16
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    16,    16,    16,    16,  2,          2,          2,      1,       S<2, 2>,       S<2, 2>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    16,    16,    16,    16,  2,          2,          2,      1,       S<4, 1>,       S<4, 1>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<8, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          1,          2,      1,       S<4, 1>,       S<2, 1>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          1,          2,      1,       S<1, 4>,       S<1, 2>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Col,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<1, 1, 4, 2>,       S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,        S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
+    // clang-format on
+    >;
+
+// Registers the gkm_gnk_gmn "irregular" int8 DL instance list: forwards `instances` together
+// with a value-initialized device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_irregular_instances
+// tuple to add_device_operation_instances.
+void add_device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_irregular_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmMultiD<Col,
+                                                        Col,
+                                                        Empty_Tuple,
+                                                        Row,
+                                                        int8_t,
+                                                        int8_t,
+                                                        Empty_Tuple,
+                                                        int8_t,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        PassThrough>>>& instances)
+{
+    // Local shorthand for the instance tuple declared above in this file.
+    using InstanceList = device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_irregular_instances;
+
+    add_device_operation_instances(instances, InstanceList{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compile-time integer list shorthand: S<a, b, ...> is ck::Sequence<a, b, ...>.
+template <ck::index_t... Values>
+using S = ck::Sequence<Values...>;
+
+// Short names for the row-major / column-major GEMM layout tags.
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+using Row = ck::tensor_layout::gemm::RowMajor;
+
+// Tuple with no elements; fills the Ds layout / Ds data-type slots of the instances.
+using Empty_Tuple = ck::Tuple<>;
+// Element-wise operation used for the A, B and C operation slots of the instances.
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+// GEMM specialization value used by the instances declared in this file.
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+// Compilation parameters for a[m, k] * b[k, n] = c[m, n]
+using device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_instances = std::tuple<
+    // clang-format off
+        // ##########################| ALayout| BLayout|    DsLayout| CLayout|  AData|  BData| AccData|      DsData| CData|           A|           B|           C|           GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|       ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|       BBlockTransfer|     CThreadTransfer|  CThreadTransfer|    CThreadTransfer|
+        // ##########################|        |        |            |        |   Type|   Type|    Type|        Type|  Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
+        // ##########################|        |        |            |        |       |       |        |            |      |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|               Order|                 |                   |
+        // ##########################|        |        |            |        |       |       |        |            |      |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                 |                   |
+        // MPerBlock=128, NPerBlock=128
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  4,          4,          4,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,   128,    16,  4,          4,          8,      1,       S<8, 2>,       S<4, 2>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,   128,    16,  4,          4,          8,      1,       S<2, 8>,       S<2, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=128, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,    64,    16,  4,          4,          2,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=64, NPerBlock=128
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,    64,   128,    16,  4,          2,          4,      1,       S<4, 4>,       S<4, 4>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<8, 1>,       S<4, 2>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=32, NPerBlock=32
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    32,    32,    32,     8,  4,          4,          2,      1,       S<2, 2>,       S<2, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 8, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=16, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    64,    16,  2,          1,          4,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 1, 2>,      S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,        S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    64,    16,  2,          1,          4,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 1, 2>,      S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,        S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=16
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    16,    16,  2,          4,          1,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,       S<16, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    16,    16,  2,          4,          1,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,       S<16, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=16
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    16,    16,    16,    16,  2,          2,          2,      1,       S<4, 1>,       S<4, 1>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<4, 1>,       S<8, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,        S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,        S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<4, 1>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<4, 1>,       S<2, 1>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<1, 4>,       S<1, 2>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
+    // clang-format on
+    >;
+
+// Entry point used to collect the DL int8 batched-GEMM instances for the
+// a[m, k] * b[k, n] = c[m, n] layout combination.
+//
+// Forwards `instances` together with a value-initialized
+// device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_instances tuple to
+// add_device_operation_instances. NOTE(review): that helper presumably appends one
+// device operation per tuple element to `instances` - confirm in
+// add_device_operation_instance.hpp.
+void add_device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmMultiD<Row, Row, Empty_Tuple, Row,
+                                                        int8_t, int8_t, Empty_Tuple, int8_t,
+                                                        PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    // Name the instance list once so the forwarding call stays on a single line.
+    using InstanceList = device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_instances;
+
+    add_device_operation_instances(instances, InstanceList{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_irregular_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_irregular_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using Row = ck::tensor_layout::gemm::RowMajor; // layout tag used for A, B and C in the instance table below
+using Col = ck::tensor_layout::gemm::ColumnMajor; // not referenced elsewhere in this file
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>; // shorthand for the integer sequences in the instance table
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough; // used for the A, B and C element-wise operation slots
+using Empty_Tuple = ck::Tuple<>; // passed as both DsLayout and DsData in the instance table
+
+static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding; // GEMM specialization used by every instance in this file
+
+/* Compilation parameters for a[m, k] * b[k, n] = c[m, n].
+ *
+ * Type list (std::tuple) of DeviceBatchedGemmMultipleD_Dl specializations. Every entry uses
+ * Row layout for A, B and C, int8_t for A/B/C data, int32_t as the accumulator type,
+ * Empty_Tuple for DsLayout and DsData, PassThrough for all three element-wise operations,
+ * the GemmMNPadding specialization, and CThreadTransferDstScalarPerVector = 1.
+ *
+ * Entries are grouped by the MPerBlock / NPerBlock pair named in the comment above each group.
+ * Inside a group the entries differ only in the M1PerThreadM111 / N1PerThreadN111 and
+ * M11N11ThreadCluster columns.
+ *
+ * NOTE(review): "irregular" presumably refers to problem sizes that are not multiples of the
+ * block tile, which would explain MN padding and scalar C stores - confirm with the callers.
+ */
+using device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_irregular_instances = std::tuple<
+    // clang-format off
+        // ##########################| ALayout| BLayout|    DsLayout| CLayout|  AData|  BData| AccData|      DsData| CData|           A|           B|           C|           GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|       ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|       BBlockTransfer|     CThreadTransfer|  CThreadTransfer|    CThreadTransfer|
+        // ##########################|        |        |            |        |   Type|   Type|    Type|        Type|  Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
+        // ##########################|        |        |            |        |       |       |        |            |      |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|               Order|                 |                   |
+        // ##########################|        |        |            |        |       |       |        |            |      |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                 |                   |
+        // MPerBlock=128, NPerBlock=128
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   128,   128,   128,    16,  4,          4,          8,      1,       S<8, 2>,       S<4, 2>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   128,   128,   128,    16,  4,          4,          8,      1,       S<4, 4>,       S<4, 2>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   128,   128,   128,    16,  4,          4,          8,      1,       S<2, 8>,       S<2, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 8, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=128, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,    64,    16,  4,          4,          2,      1,       S<4, 4>,       S<4, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,    64,    16,  4,          4,          2,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<8, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=128
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,    64,   128,    16,  4,          2,          4,      1,       S<4, 4>,       S<4, 4>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,    64,   128,    16,  4,          2,          4,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<8, 1, 32, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<8, 1>,       S<4, 2>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=32, NPerBlock=32
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    32,    32,    32,     8,  4,          2,          4,      1,       S<4, 2>,       S<2, 2>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 8, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    32,    32,    32,     8,  4,          4,          2,      1,       S<2, 2>,       S<4, 2>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 8, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    32,    32,    32,     8,  4,          4,          2,      1,       S<2, 2>,       S<2, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<2, 1, 4, 4>,       S<4, 1, 8, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=16
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    16,    16,    16,    16,  2,          2,          2,      1,       S<2, 2>,       S<2, 2>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    16,    16,    16,    16,  2,          2,          2,      1,       S<4, 1>,       S<4, 1>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,        S<4, 1, 4, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,       S<32, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          1,          2,      1,       S<4, 1>,       S<2, 1>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          1,          2,      1,       S<1, 4>,       S<1, 2>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Row, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<4, 1, 1, 2>,        S<1, 1, 8, 1>,  S<0, 3, 1, 2>,  S<0, 3, 1, 2>,       S<4, 1, 1, 2>,      S<0, 3, 1, 2>,        S<1, 1, 1, 2>,      S<1, 1, 4, 2>,        S<4, 1, 2, 1>,   S<0, 3, 1, 2>,  S<0, 3, 1, 2>,      S<1, 1, 4, 1>,      S<0, 3, 1, 2>,        S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
+    // clang-format on
+    >;
+
+// Entry point used to collect the DL int8 "irregular" batched-GEMM instances for the
+// a[m, k] * b[k, n] = c[m, n] layout combination.
+//
+// Forwards `instances` together with a value-initialized
+// device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_irregular_instances tuple to
+// add_device_operation_instances. NOTE(review): that helper presumably appends one
+// device operation per tuple element to `instances` - confirm in
+// add_device_operation_instance.hpp.
+void add_device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_irregular_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmMultiD<Row, Row, Empty_Tuple, Row,
+                                                        int8_t, int8_t, Empty_Tuple, int8_t,
+                                                        PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    // Name the instance list once so the forwarding call stays on a single line.
+    using InstanceList =
+        device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_irregular_instances;
+
+    add_device_operation_instances(instances, InstanceList{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using Row = ck::tensor_layout::gemm::RowMajor; // layout tag used for A and C in the instance table below
+using Col = ck::tensor_layout::gemm::ColumnMajor; // layout tag used for B in the instance table below
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>; // shorthand for the integer sequences in the instance table
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough; // used for the A, B and C element-wise operation slots
+using Empty_Tuple = ck::Tuple<>; // passed as both DsLayout and DsData in the instance table
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; // GEMM specialization used by the instances listed below
+
+// Compilation parameters for a[m, k] * b[n, k] = c[m, n]
+using device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_instances = std::tuple<
+    // clang-format off
+        // ##########################| ALayout| BLayout|    DsLayout| CLayout|  AData|  BData| AccData|      DsData| CData|           A|           B|           C|           GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|       ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|       BBlockTransfer|     CThreadTransfer|  CThreadTransfer|    CThreadTransfer|
+        // ##########################|        |        |            |        |   Type|   Type|    Type|        Type|  Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
+        // ##########################|        |        |            |        |       |       |        |            |      |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|               Order|                 |                   |
+        // ##########################|        |        |            |        |       |       |        |            |      |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                 |                   |
+        // MPerBlock=128, NPerBlock=128
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,   128,    16,  4,          4,          4,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,   128,    16,  4,          4,          8,      1,       S<8, 2>,       S<4, 2>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,   128,   128,    16,  4,          4,          8,      1,       S<2, 8>,       S<2, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // // MPerBlock=128, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,   128,    64,    16,  4,          4,          2,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // // MPerBlock=64, NPerBlock=128
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,   256,    64,   128,    16,  4,          2,          4,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<8, 1>,       S<4, 2>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=32, NPerBlock=32
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    32,    32,    32,     8,  4,          4,          2,      1,       S<2, 2>,       S<2, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=16, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    64,    16,  2,          1,          4,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 1, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    16,    64,    16,  2,          1,          4,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 1, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  4>,
+        // MPerBlock=64, NPerBlock=16
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    16,    16,  2,          4,          1,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    16,    16,  2,          4,          1,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=16
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    16,    16,    16,    16,  2,          2,          2,      1,       S<4, 1>,       S<4, 1>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<4, 1>,       S<8, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<4, 1>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<4, 1>,       S<2, 1>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          1,          2,      1,       S<1, 4>,       S<1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  2>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,    GemmDefault,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
+    // clang-format on
+    >;
+
+// Passes a default-constructed device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_instances
+// tuple, together with the caller's `instances` vector, to add_device_operation_instances.
+// NOTE(review): presumably that helper appends one device-op object per tuple element -- confirm
+// against the helper's definition.
+void add_device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmMultiD<Row,
+                                                        Col,
+                                                        Empty_Tuple,
+                                                        Row,
+                                                        int8_t,
+                                                        int8_t,
+                                                        Empty_Tuple,
+                                                        int8_t,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        PassThrough>>>& instances)
+{
+    // Local shorthand for the instance list this translation unit registers.
+    using InstanceTuple = device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_instances;
+
+    add_device_operation_instances(instances, InstanceTuple{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_irregular_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_irregular_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using Row = ck::tensor_layout::gemm::RowMajor;    // row-major GEMM layout
+using Col = ck::tensor_layout::gemm::ColumnMajor; // column-major GEMM layout
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>; // shorthand for the index sequences in the instance table
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough; // A/B/C element-wise op
+using Empty_Tuple = ck::Tuple<>; // empty tuple passed for the Ds layout/data arguments
+
+static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding;
+
+/*
+ * Compilation parameters for a[m, k] * b[n, k] = c[m, n]
+ *
+ * Each tuple element is one DeviceBatchedGemmMultipleD_Dl instantiation; the commented column
+ * headings below name the template argument in each position. Every entry shares the same
+ * layouts (A Row, B Col, C Row), int8_t A/B/C data with an int32_t accumulator, an empty Ds
+ * tuple, PassThrough element-wise operations and the GemmMNPadding specialization, and ends with
+ * a CThreadTransfer DstScalarPerVector of 1. Entries differ in block size, per-block tile sizes
+ * and thread-cluster/transfer shapes, and are grouped by the MPerBlock/NPerBlock comments.
+ *
+ * NOTE(review): "irregular" presumably means problem sizes that are not multiples of the tile
+ * sizes (hence MNPadding) -- confirm against the callers that select these instances.
+ */
+using device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_irregular_instances = std::tuple<
+    // clang-format off
+        // ##########################| ALayout| BLayout|    DsLayout| CLayout|  AData|  BData| AccData|      DsData| CData|           A|           B|           C|           GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|       ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|       BBlockTransfer|     CThreadTransfer|  CThreadTransfer|    CThreadTransfer|
+        // ##########################|        |        |            |        |   Type|   Type|    Type|        Type|  Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|      DstVectorTensor|        SrcDstAccess|  SrcDstVectorDim| DstScalarPerVector|
+        // ##########################|        |        |            |        |       |       |        |            |      |   Operation|   Operation|   Operation|               |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder|  Lengths_K0_M0_M1_K1|               Order|                 |                   |
+        // ##########################|        |        |            |        |       |       |        |            |      |            |            |            |               |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                     |                   |                     |               |               |                    |                   |                     |                    |                 |                   |
+        // MPerBlock=128, NPerBlock=128
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   128,   128,   128,    16,  4,          4,          8,      1,       S<8, 2>,       S<4, 2>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   128,   128,   128,    16,  4,          4,          8,      1,       S<4, 4>,       S<4, 2>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   128,   128,   128,    16,  4,          4,          8,      1,       S<2, 8>,       S<2, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 2, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=128, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,    64,    16,  4,          4,          2,      1,       S<4, 4>,       S<4, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,   128,    64,    16,  4,          4,          2,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=128
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,    64,   128,    16,  4,          2,          4,      1,       S<4, 4>,       S<4, 4>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,   256,    64,   128,    16,  4,          2,          4,      1,       S<2, 8>,       S<2, 8>,      S<8, 1, 1, 4>,       S<2, 1, 64, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<4, 2>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<2, 4>,       S<2, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<8, 1>,       S<4, 2>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,    64,     8,  4,          4,          4,      1,       S<4, 2>,       S<8, 1>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 32, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=32, NPerBlock=32
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    32,    32,    32,     8,  4,          2,          4,      1,       S<4, 2>,       S<2, 2>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    32,    32,    32,     8,  4,          4,          2,      1,       S<2, 2>,       S<4, 2>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    32,    32,    32,     8,  4,          4,          2,      1,       S<2, 2>,       S<2, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>,      S<4, 1, 2, 4>,       S<2, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,        S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=16, NPerBlock=16
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    16,    16,    16,    16,  2,          2,          2,      1,       S<2, 2>,       S<2, 2>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    16,    16,    16,    16,  2,          2,          2,      1,       S<4, 1>,       S<4, 1>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 4, 2>,       S<4, 1, 4, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=64
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,     8,    64,    32,  2,          1,          2,      1,       S<2, 2>,       S<8, 2>,      S<4, 1, 1, 2>,        S<8, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<8, 1, 4, 2>,      S<4, 1, 16, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=64, NPerBlock=8
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,    64,    64,     8,    32,  2,          2,          1,      1,       S<8, 2>,       S<2, 2>,      S<8, 1, 4, 2>,       S<4, 1, 16, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<8, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        // MPerBlock=8, NPerBlock=8
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          1,          2,      1,       S<4, 1>,       S<2, 1>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          1,          2,      1,       S<1, 4>,       S<1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<2, 1>,       S<4, 1>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>,
+        DeviceBatchedGemmMultipleD_Dl<     Row,     Col, Empty_Tuple,     Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough,  GemmMNPadding,     8,     8,     8,     4,  2,          2,          1,      1,       S<1, 2>,       S<1, 4>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,        S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>,      S<4, 1, 1, 2>,       S<1, 1, 8, 1>,   S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,        S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,                5,                  1>
+    // clang-format on
+    >;
+
+// Passes a default-constructed
+// device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_irregular_instances tuple, together with
+// the caller's `instances` vector, to add_device_operation_instances.
+// NOTE(review): presumably that helper appends one device-op object per tuple element -- confirm
+// against the helper's definition.
+void add_device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_irregular_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmMultiD<Row,
+                                                        Col,
+                                                        Empty_Tuple,
+                                                        Row,
+                                                        int8_t,
+                                                        int8_t,
+                                                        Empty_Tuple,
+                                                        int8_t,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        PassThrough>>>& instances)
+{
+    // Local shorthand for the instance list this translation unit registers.
+    using IrregularInstances =
+        device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_irregular_instances;
+
+    add_device_operation_instances(instances, IrregularInstances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp
@@ -41,10 +41,11 @@ template <index_t NumDimG,
 using device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instances =
    std::tuple<
        // clang-format off
-        // #############################################|  NumDimG| NumDimM| NumDimN| NumDimK| NumDimO|  AData|  B0Data|  B1Data|  CData|     Acc0BiasData| Acc1BiasData| AccData| CShuffle|           A|          B0|        Acc0|          B1|           C|           GEMM|   ATensorSpec|  B0TensorSpec|  B1TensorSpec|   CTensorSpec| NumGemmK| Block| Gemm01| Gemm0| Gemm0| Gemm1| Gemm1| AK1| BK1| B1K1| MPer| NPer| Gemm0| Gemm0| Gemm1|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockLds|  B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer| MaskingSpec|
-        // #############################################|         |        |        |        |        |   Type|    Type|    Type|   Type|             Type|         Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Specialization|              |              |              |              | Prefetch|  Size|   MPer|  NPer|  KPer|  NPer|  KPer|    |    |     |  XDL|  XDL|  MXdl|  NXdl|  NXdl|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|            |
-        // #############################################|         |        |        |        |        |       |        |        |       |                 |             |        |         |   Operation|   Operation|   Operation|   Operation|   Operation|               |              |              |              |              |    Stage|      |  Block| Block| Block| Block| Block|    |    |     |     |     |   Per|   Per|   Per| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|            |
-        // #############################################|         |        |        |        |        |       |        |        |       |                 |             |        |         |            |            |            |            |            |               |              |              |              |              |         |      |       |      |      |      |      |    |    |     |     |     |  Wave|  Wave|  Wave|                |               |               |               |               |               |          |                 |                |                |                |                |                |           |                 |                |                |                |                |                |           |            |            |                             |                |            |
+        // #############################################|  NumDimG| NumDimM| NumDimN| NumDimK| NumDimO|  AData|  B0Data|  B1Data|  CData|     Acc0BiasData| Acc1BiasData| AccData| CShuffle|           A|          B0|        Acc0|          B1|           C|           GEMM|   ATensorSpec|  B0TensorSpec|  B1TensorSpec|   CTensorSpec| NumGemmK| Block| Gemm01| Gemm0| Gemm0| Gemm1| Gemm1| AK1| BK1| B1K1| MPer| NPer| Gemm0| Gemm0| Gemm1|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockLds|  B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer| MaskingSpec|  D0s Bias|
+        // #############################################|         |        |        |        |        |   Type|    Type|    Type|   Type|             Type|         Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Specialization|              |              |              |              | Prefetch|  Size|   MPer|  NPer|  KPer|  NPer|  KPer|    |    |     |  XDL|  XDL|  MXdl|  NXdl|  NXdl|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|            | SrcScalar|
+        // #############################################|         |        |        |        |        |       |        |        |       |                 |             |        |         |   Operation|   Operation|   Operation|   Operation|   Operation|               |              |              |              |              |    Stage|      |  Block| Block| Block| Block| Block|    |    |     |     |     |   Per|   Per|   Per| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|            | PerVector|
+        // #############################################|         |        |        |        |        |       |        |        |       |                 |             |        |         |            |            |            |            |            |               |              |              |              |              |         |      |       |      |      |      |      |    |    |     |     |     |  Wave|  Wave|  Wave|                |               |               |               |               |               |          |                 |                |                |                |                |                |           |                 |                |                |                |                |                |           |            |            |                             |                |            |          |
+        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<BF16>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,     GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,    64,    32,   128,    32,   8,   8,    2,   32,   32,     1,     2,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec,        1>,
        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<BF16>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    256,   128,    32,    64,    32,   8,   8,    2,   32,   32,     2,     4,     2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S<16, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<BF16>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    256,   128,    32,   128,    32,   8,   8,    2,   32,   32,     2,     4,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<BF16>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,   256,    32,    64,    32,   8,   8,    2,   32,   32,     1,     8,     2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S<16, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
@@ -58,8 +59,9 @@ using device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_
        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<BF16>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,     64,   256,    64,   128,    32,   8,   8,    2,   16,   16,     1,    16,     8,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           8,               S<1, 16, 1,16>,               8, MaskingSpec>,
        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<BF16>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,     64,   256,    64,    64,    32,   8,   8,    2,   16,   16,     1,    16,     4,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S<16, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           4,               S<1, 32, 1, 8>,               8, MaskingSpec>,
        // Padded fallback kernel
-        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<BF16>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,    GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,   128,    64,   128,    32,   8,   8,    2,   32,   32,     1,     4,     4,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
-        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<BF16>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,    GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,    64,    32,   128,    32,   8,   8,    2,   32,   32,     1,     2,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>
+        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<BF16>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,     GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,   128,    64,   128,    32,   8,   8,    2,   32,   32,     1,     4,     4,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec,        1>,  
+        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<BF16>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,     GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,   128,    64,   128,    32,   8,   8,    2,   32,   32,     1,     4,     4,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
+        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<BF16>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,     GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,    64,    32,   128,    32,   8,   8,    2,   32,   32,     1,     2,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>
        // clang-format on
        >;


--- a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
@@ -41,10 +41,11 @@ template <index_t NumDimG,
 using device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instances =
    std::tuple<
        // clang-format off
-        // #############################################|  NumDimG| NumDimM| NumDimN| NumDimK| NumDimO| AData| B0Data| B1Data| CData|    Acc0BiasData| Acc1BiasData| AccData| CShuffle|           A|          B0|        Acc0|          B1|           C|           GEMM|   ATensorSpec|  B0TensorSpec|  B1TensorSpec|   CTensorSpec| NumGemmK| Block| Gemm01| Gemm0| Gemm0| Gemm1| Gemm1| AK1| BK1| B1K1| MPer| NPer| Gemm0| Gemm0| Gemm1|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockLds|  B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer| MaskingSpec|
-        // #############################################|         |        |        |        |        |  Type|   Type|   Type|  Type|            Type|         Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Specialization|              |              |              |              | Prefetch|  Size|   MPer|  NPer|  KPer|  NPer|  KPer|    |    |     |  XDL|  XDL|  MXdl|  NXdl|  NXdl|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|            |
-        // #############################################|         |        |        |        |        |      |       |       |      |                |             |        |         |   Operation|   Operation|   Operation|   Operation|   Operation|               |              |              |              |              |    Stage|      |  Block| Block| Block| Block| Block|    |    |     |     |     |   Per|   Per|   Per| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|            |
-        // #############################################|         |        |        |        |        |      |       |       |      |                |             |        |         |            |            |            |            |            |               |              |              |              |              |         |      |       |      |      |      |      |    |    |     |     |     |  Wave|  Wave|  Wave|                |               |               |               |               |               |          |                 |                |                |                |                |                |           |                 |                |                |                |                |                |           |            |            |                             |                |            |
+        // #############################################|  NumDimG| NumDimM| NumDimN| NumDimK| NumDimO| AData| B0Data| B1Data| CData|    Acc0BiasData| Acc1BiasData| AccData| CShuffle|           A|          B0|        Acc0|          B1|           C|           GEMM|   ATensorSpec|  B0TensorSpec|  B1TensorSpec|   CTensorSpec| NumGemmK| Block| Gemm01| Gemm0| Gemm0| Gemm1| Gemm1| AK1| BK1| B1K1| MPer| NPer| Gemm0| Gemm0| Gemm1|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockLds|  B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer| MaskingSpec|  D0s Bias|
+        // #############################################|         |        |        |        |        |  Type|   Type|   Type|  Type|            Type|         Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Specialization|              |              |              |              | Prefetch|  Size|   MPer|  NPer|  KPer|  NPer|  KPer|    |    |     |  XDL|  XDL|  MXdl|  NXdl|  NXdl|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|            | SrcScalar|
+        // #############################################|         |        |        |        |        |      |       |       |      |                |             |        |         |   Operation|   Operation|   Operation|   Operation|   Operation|               |              |              |              |              |    Stage|      |  Block| Block| Block| Block| Block|    |    |     |     |     |   Per|   Per|   Per| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|            | PerVector|
+        // #############################################|         |        |        |        |        |      |       |       |      |                |             |        |         |            |            |            |            |            |               |              |              |              |              |         |      |       |      |      |      |      |    |    |     |     |     |  Wave|  Wave|  Wave|                |               |               |               |               |               |          |                 |                |                |                |                |                |           |                 |                |                |                |                |                |           |            |            |                             |                |            |          |
+        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<F16>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,     GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,    64,    32,   128,    32,   8,   8,    2,   32,   32,     1,     2,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec,       1>,
        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<F16>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    256,   128,    32,    64,    32,   8,   8,    2,   32,   32,     2,     4,     2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S<16, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<F16>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    256,   128,    32,   128,    32,   8,   8,    2,   32,   32,     2,     4,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
 #if CK_WORKAROUND_SWDEV_388832
@@ -60,6 +61,7 @@ using device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16
        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<F16>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,     64,   256,    64,   128,    32,   8,   8,    2,   16,   16,     1,    16,     8,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           8,               S<1, 16, 1,16>,               8, MaskingSpec>,
        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<F16>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,     64,   256,    64,    64,    32,   8,   8,    2,   16,   16,     1,    16,     4,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S<16, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           4,               S<1, 32, 1, 8>,               8, MaskingSpec>,
        // Padded fallback kernel
+        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<F16>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,     GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,   128,    64,   128,    32,   8,   8,    2,   32,   32,     1,     4,     4,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec,       1>,
        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<F16>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,     GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,   128,    64,   128,    32,   8,   8,    2,   32,   32,     1,     4,     4,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<F16>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,     GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,    64,    32,   128,    32,   8,   8,    2,   32,   32,     1,     2,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>
        // clang-format on

--- a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp
@@ -45,6 +45,7 @@ using device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_
        // #############################################|         |        |        |        |        |   Type|    Type|    Type|   Type|         Type|         Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Specialization|              |              |              |              | Prefetch|  Size|   MPer|  NPer|  KPer|  NPer|  KPer|    |    |     |  XDL|  XDL|  MXdl|  NXdl|  NXdl|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|            |
        // #############################################|         |        |        |        |        |       |        |        |       |             |             |        |         |   Operation|   Operation|   Operation|   Operation|   Operation|               |              |              |              |              |    Stage|      |  Block| Block| Block| Block| Block|    |    |     |     |     |   Per|   Per|   Per| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|            |
        // #############################################|         |        |        |        |        |       |        |        |       |             |             |        |         |            |            |            |            |            |               |              |              |              |              |         |      |       |      |      |      |      |    |    |     |     |     |  Wave|  Wave|  Wave|                |               |               |               |               |               |          |                 |                |                |                |                |                |           |                 |                |                |                |                |                |           |            |            |                             |                |            |
+        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,     GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,    64,    32,   128,    32,   8,   8,    2,   32,   32,     1,     2,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    256,   128,    32,    64,    32,   8,   8,    2,   32,   32,     2,     4,     2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S<16, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    256,   128,    32,   128,    32,   8,   8,    2,   32,   32,     2,     4,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,   256,    32,    64,    32,   8,   8,    2,   32,   32,     1,     8,     2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S<16, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
@@ -58,8 +59,7 @@ using device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_
        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,     64,   256,    64,   128,    32,   8,   8,    2,   16,   16,     1,    16,     8,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           8,               S<1, 16, 1,16>,               8, MaskingSpec>,
        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,     64,   256,    64,    64,    32,   8,   8,    2,   16,   16,     1,    16,     4,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S<16, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           4,               S<1, 32, 1, 8>,               8, MaskingSpec>,
        // Padded fallback kernel
-        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,     GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,   128,    64,   128,    32,   8,   8,    2,   32,   32,     1,     4,     4,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
-        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,     GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,    64,    32,   128,    32,   8,   8,    2,   32,   32,     1,     2,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>
+        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,     GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,   128,    64,   128,    32,   8,   8,    2,   32,   32,     1,     4,     4,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>
        // clang-format on
        >;


--- a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
@@ -45,6 +45,7 @@ using device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_
        // #############################################|         |        |        |        |        |  Type|   Type|   Type|  Type|         Type|         Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Specialization|              |              |              |              | Prefetch|  Size|   MPer|  NPer|  KPer|  NPer|  KPer|    |    |     |  XDL|  XDL|  MXdl|  NXdl|  NXdl|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|            |
        // #############################################|         |        |        |        |        |      |       |       |      |             |             |        |         |   Operation|   Operation|   Operation|   Operation|   Operation|               |              |              |              |              |    Stage|      |  Block| Block| Block| Block| Block|    |    |     |     |     |   Per|   Per|   Per| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|            |
        // #############################################|         |        |        |        |        |      |       |       |      |             |             |        |         |            |            |            |            |            |               |              |              |              |              |         |      |       |      |      |      |      |    |    |     |     |     |  Wave|  Wave|  Wave|                |               |               |               |               |               |          |                 |                |                |                |                |                |           |                 |                |                |                |                |                |           |            |            |                             |                |            |
+        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,     GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,    64,    32,   128,    32,   8,   8,    2,   32,   32,     1,     2,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    256,   128,    32,    64,    32,   8,   8,    2,   32,   32,     2,     4,     2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S<16, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    256,   128,    32,   128,    32,   8,   8,    2,   32,   32,     2,     4,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
 #if CK_WORKAROUND_SWDEV_388832
@@ -60,8 +61,7 @@ using device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_
        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,     64,   256,    64,   128,    32,   8,   8,    2,   16,   16,     1,    16,     8,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           8,               S<1, 16, 1,16>,               8, MaskingSpec>,
        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,     64,   256,    64,    64,    32,   8,   8,    2,   16,   16,     1,    16,     4,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S<16, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           4,               S<1, 32, 1, 8>,               8, MaskingSpec>,
        // Padded fallback kernel
-        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,     GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,   128,    64,   128,    32,   8,   8,    2,   32,   32,     1,     4,     4,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
-        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,     GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,    64,    32,   128,    32,   8,   8,    2,   32,   32,     1,     2,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>
+        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,     GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,   128,    64,   128,    32,   8,   8,    2,   32,   32,     1,     4,     4,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>
        // clang-format on
        >;


--- a/profiler/README.md
+++ b/profiler/README.md
@@ -76,3 +76,30 @@ e_m_n: dim 4, lengths {128, 128, 128, 128}, strides {2097152, 16384, 128, 1}
 ....
 Best Perf: 211.405 ms, 41.6077 TFlops, 15.2372 GB/s
 ```
+
+## Profile batched gemm multiple D kernels
+```bash
+#arg1: tensor operation (batched_gemm_multi_d=Batched GEMM multi D);
+#arg2: data type (0: fp16; 1: int8)
+#arg3: matrix layout (0: A[g, m, k] * B[g, k, n] = C[g, m, n];
+#                     1: A[g, m, k] * B[g, n, k] = C[g, m, n];
+#                     2: A[g, k, m] * B[g, k, n] = C[g, m, n];
+#                     3: A[g, k, m] * B[g, n, k] = C[g, m, n])
+#arg4: verification (0: no; 1: yes)
+#arg5: initialization (0: no init; 1: integer value; 2: decimal value)
+#arg6: print tensor value (0: no; 1: yes)
+#arg7: time kernel (0: no; 1: yes)
+#arg8 to 17: M, N, K, StrideA, StrideB, StrideC, BatchStrideA, BatchStrideB, BatchStrideC, BatchCount
+
+################                   op  datatype  layout  verify  init  log  time    M    N    K StrideA StrideB StrideC BatchStrideA BatchStrideB BatchStrideC BatchCount
+./bin/ckProfiler batched_gemm_multi_d         0       1       0     0    0     1 4096 4096 4096    4096    4096    4096     16777216     16777216     16777216         16
+```
+
+Result (Radeon RX 6800 XT)
+```bash
+arg.a_grid_desc_k0_m0_m1_k1_{2048, 4096, 2}
+arg.b_grid_desc_k0_n0_n1_k1_{2048, 4096, 2}
+arg.e_grid_desc_m_n_{ 4096, 4096}
+....
+Best Perf: 58.0306 ms, 37.8942 TFlops, 27.7545 GB/s
+```
--- a/profiler/include/profiler/profile_batched_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_impl.hpp
@@ -8,9 +8,11 @@
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_batched_gemm.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

 #include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
+#include "ck/library/tensor_operation_instance/gpu/batched_gemm_multi_d.hpp"

 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
@@ -27,7 +29,11 @@ template <typename ADataType,
          typename CDataType,
          typename ALayout,
          typename BLayout,
-          typename CLayout>
+          typename CLayout,
+          typename AElementOp,
+          typename BElementOp,
+          typename CElementOp,
+          typename DeviceOp>
 bool profile_batched_gemm_impl(int do_verification,
                               int init_method,
                               bool do_log,
@@ -88,10 +94,6 @@ bool profile_batched_gemm_impl(int do_verification,
        b_g_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
    }

-    using AElementOp = ck::tensor_operation::element_wise::PassThrough;
-    using BElementOp = ck::tensor_operation::element_wise::PassThrough;
-    using CElementOp = ck::tensor_operation::element_wise::PassThrough;
-
    const auto a_element_op = AElementOp{};
    const auto b_element_op = BElementOp{};
    const auto c_element_op = CElementOp{};
@@ -124,16 +126,6 @@ bool profile_batched_gemm_impl(int do_verification,
    b_device_buf.ToDevice(b_g_k_n.mData.data());
    c_device_buf.ToDevice(c_g_m_n_device_result.mData.data());

-    using DeviceOp = ck::tensor_operation::device::DeviceBatchedGemm<ALayout,
-                                                                     BLayout,
-                                                                     CLayout,
-                                                                     ADataType,
-                                                                     BDataType,
-                                                                     CDataType,
-                                                                     AElementOp,
-                                                                     BElementOp,
-                                                                     CElementOp>;
-
    // get device op instances
    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
        DeviceOp>::GetInstances();
@@ -148,23 +140,62 @@ bool profile_batched_gemm_impl(int do_verification,
    // profile device op instances
    for(auto& op_ptr : op_ptrs)
    {
-        auto argument_ptr =
-            op_ptr->MakeArgumentPointer(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
-                                        static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
-                                        static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
-                                        M,
-                                        N,
-                                        K,
-                                        StrideA,
-                                        StrideB,
-                                        StrideC,
-                                        BatchStrideA,
-                                        BatchStrideB,
-                                        BatchStrideC,
-                                        BatchCount,
-                                        ck::tensor_operation::element_wise::PassThrough{},
-                                        ck::tensor_operation::element_wise::PassThrough{},
-                                        ck::tensor_operation::element_wise::PassThrough{});
+        std::unique_ptr<tensor_operation::device::BaseArgument> argument_ptr;
+        // The else branch below builds the argument for the multi-D DL kernel.
+        if constexpr(std::is_same<
+                         DeviceOp,
+                         ck::tensor_operation::device::DeviceBatchedGemm<ALayout,
+                                                                         BLayout,
+                                                                         CLayout,
+                                                                         ADataType,
+                                                                         BDataType,
+                                                                         CDataType,
+                                                                         AElementOp,
+                                                                         BElementOp,
+                                                                         CElementOp>>::value)
+        {
+
+            argument_ptr =
+                op_ptr->MakeArgumentPointer(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
+                                            static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
+                                            static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
+                                            M,
+                                            N,
+                                            K,
+                                            StrideA,
+                                            StrideB,
+                                            StrideC,
+                                            BatchStrideA,
+                                            BatchStrideB,
+                                            BatchStrideC,
+                                            BatchCount,
+                                            ck::tensor_operation::element_wise::PassThrough{},
+                                            ck::tensor_operation::element_wise::PassThrough{},
+                                            ck::tensor_operation::element_wise::PassThrough{});
+        }
+        else
+        {
+            argument_ptr =
+                op_ptr->MakeArgumentPointer(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
+                                            static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
+                                            {},
+                                            static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
+                                            M,
+                                            N,
+                                            K,
+                                            BatchCount,
+                                            StrideA,
+                                            StrideB,
+                                            {},
+                                            StrideC,
+                                            BatchStrideA,
+                                            BatchStrideB,
+                                            {},
+                                            BatchStrideC,
+                                            ck::tensor_operation::element_wise::PassThrough{},
+                                            ck::tensor_operation::element_wise::PassThrough{},
+                                            ck::tensor_operation::element_wise::PassThrough{});
+        }

        auto invoker_ptr = op_ptr->MakeInvokerPointer();


--- a/profiler/src/CMakeLists.txt
+++ b/profiler/src/CMakeLists.txt
@@ -34,6 +34,7 @@ set(PROFILER_SOURCES
    profile_grouped_gemm_fastgelu.cpp
    profile_contraction_bilinear.cpp
    profile_contraction_scale.cpp
+    profile_batched_gemm_multi_d.cpp
 )

 set(PROFILER_EXECUTABLE ckProfiler)
@@ -77,5 +78,5 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fastgel
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_bilinear_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_scale_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool_fwd_instance)
-
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_multi_d_instance)
 rocm_install(TARGETS ${PROFILER_EXECUTABLE} COMPONENT profiler)
--- a/profiler/src/profile_batched_gemm.cpp
+++ b/profiler/src/profile_batched_gemm.cpp
@@ -10,6 +10,8 @@
 #include "profiler/profile_batched_gemm_impl.hpp"
 #include "profiler_operation_registry.hpp"

+#include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
+
 enum struct GemmMatrixLayout
 {
    MK_KN_MN, // 0
@@ -78,55 +80,72 @@ int profile_batched_gemm(int argc, char* argv[])
    using Row = ck::tensor_layout::gemm::RowMajor;
    using Col = ck::tensor_layout::gemm::ColumnMajor;

-    auto profile = [&](auto a_type,
-                       auto b_type,
-                       auto c_type,
-                       auto a_layout,
-                       auto b_layout,
-                       auto c_layout) {
-        using ADataType = decltype(a_type);
-        using BDataType = decltype(b_type);
-        using CDataType = decltype(c_type);
-
-        using ALayout = decltype(a_layout);
-        using BLayout = decltype(b_layout);
-        using CLayout = decltype(c_layout);
-
-        const int DefaultStrideA = ck::is_same_v<ALayout, Row> ? K : M;
-        const int DefaultStrideB = ck::is_same_v<BLayout, Row> ? N : K;
-        const int DefaultStrideC = ck::is_same_v<CLayout, Row> ? N : M;
-
-        const int StrideA_ = (StrideA < 0) ? DefaultStrideA : StrideA;
-        const int StrideB_ = (StrideB < 0) ? DefaultStrideB : StrideB;
-        const int StrideC_ = (StrideC < 0) ? DefaultStrideC : StrideC;
-
-        const int DefaultBatchStrideA = (ck::is_same_v<ALayout, Row> ? M : K) * StrideA_;
-        const int DefaultBatchStrideB = (ck::is_same_v<BLayout, Row> ? K : N) * StrideB_;
-        const int DefaultBatchStrideC = (ck::is_same_v<CLayout, Row> ? M : N) * StrideC_;
-
-        const int BatchStrideA_ = (BatchStrideA < 0) ? DefaultBatchStrideA : BatchStrideA;
-        const int BatchStrideB_ = (BatchStrideB < 0) ? DefaultBatchStrideB : BatchStrideB;
-        const int BatchStrideC_ = (BatchStrideC < 0) ? DefaultBatchStrideC : BatchStrideC;
-
-        bool pass = ck::profiler::
-            profile_batched_gemm_impl<ADataType, BDataType, CDataType, ALayout, BLayout, CLayout>(
-                do_verification,
-                init_method,
-                do_log,
-                time_kernel,
-                M,
-                N,
-                K,
-                BatchStrideA_,
-                BatchStrideB_,
-                BatchStrideC_,
-                StrideA_,
-                StrideB_,
-                StrideC_,
-                BatchCount);
-
-        return pass ? 0 : 1;
-    };
+    auto profile =
+        [&](auto a_type, auto b_type, auto c_type, auto a_layout, auto b_layout, auto c_layout) {
+            using ADataType = decltype(a_type);
+            using BDataType = decltype(b_type);
+            using CDataType = decltype(c_type);
+
+            using ALayout = decltype(a_layout);
+            using BLayout = decltype(b_layout);
+            using CLayout = decltype(c_layout);
+
+            const int DefaultStrideA = ck::is_same_v<ALayout, Row> ? K : M;
+            const int DefaultStrideB = ck::is_same_v<BLayout, Row> ? N : K;
+            const int DefaultStrideC = ck::is_same_v<CLayout, Row> ? N : M;
+
+            const int StrideA_ = (StrideA < 0) ? DefaultStrideA : StrideA;
+            const int StrideB_ = (StrideB < 0) ? DefaultStrideB : StrideB;
+            const int StrideC_ = (StrideC < 0) ? DefaultStrideC : StrideC;
+
+            const int DefaultBatchStrideA = (ck::is_same_v<ALayout, Row> ? M : K) * StrideA_;
+            const int DefaultBatchStrideB = (ck::is_same_v<BLayout, Row> ? K : N) * StrideB_;
+            const int DefaultBatchStrideC = (ck::is_same_v<CLayout, Row> ? M : N) * StrideC_;
+
+            const int BatchStrideA_ = (BatchStrideA < 0) ? DefaultBatchStrideA : BatchStrideA;
+            const int BatchStrideB_ = (BatchStrideB < 0) ? DefaultBatchStrideB : BatchStrideB;
+            const int BatchStrideC_ = (BatchStrideC < 0) ? DefaultBatchStrideC : BatchStrideC;
+
+            using AElementOp = ck::tensor_operation::element_wise::PassThrough;
+            using BElementOp = ck::tensor_operation::element_wise::PassThrough;
+            using CElementOp = ck::tensor_operation::element_wise::PassThrough;
+
+            using DeviceOp = ck::tensor_operation::device::DeviceBatchedGemm<ALayout,
+                                                                             BLayout,
+                                                                             CLayout,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             AElementOp,
+                                                                             BElementOp,
+                                                                             CElementOp>;
+
+            bool pass = ck::profiler::profile_batched_gemm_impl<ADataType,
+                                                                BDataType,
+                                                                CDataType,
+                                                                ALayout,
+                                                                BLayout,
+                                                                CLayout,
+                                                                AElementOp,
+                                                                BElementOp,
+                                                                CElementOp,
+                                                                DeviceOp>(do_verification,
+                                                                          init_method,
+                                                                          do_log,
+                                                                          time_kernel,
+                                                                          M,
+                                                                          N,
+                                                                          K,
+                                                                          BatchStrideA_,
+                                                                          BatchStrideB_,
+                                                                          BatchStrideC_,
+                                                                          StrideA_,
+                                                                          StrideB_,
+                                                                          StrideC_,
+                                                                          BatchCount);
+
+            return pass ? 0 : 1;
+        };

    if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN)
    {

--- a/profiler/src/profile_batched_gemm_multi_d.cpp
+++ b/profiler/src/profile_batched_gemm_multi_d.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdint>
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "profiler/profile_batched_gemm_impl.hpp"
+#include "profiler_operation_registry.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/batched_gemm_multi_d.hpp"
+
+enum struct GemmMatrixLayout // layout selector; argv[3] is cast to this type
+{
+    MK_KN_MN, // 0: A[g, m, k] * B[g, k, n] = C[g, m, n]
+    MK_NK_MN, // 1: A[g, m, k] * B[g, n, k] = C[g, m, n]
+    KM_KN_MN, // 2: A[g, k, m] * B[g, k, n] = C[g, m, n]
+    KM_NK_MN, // 3: A[g, k, m] * B[g, n, k] = C[g, m, n]
+};
+
+enum struct GemmDataType // data-type selector; argv[2] is cast to this type
+{
+    F16_F16_F16,    // 0: fp16 for A, B and C
+    INT8_INT8_INT8, // 1: int8 for A, B and C
+};
+
+#define OP_NAME "batched_gemm_multi_d" // operation name: printed in the usage text and passed to REGISTER_PROFILER_OPERATION
+#define OP_DESC "Batched GEMM multi D" // description: printed next to OP_NAME in the usage text and passed to the same macro
+
+// Command-line entry point for profiling the batched GEMM multi-D operation.
+//
+// Expects exactly 18 argv entries (see the usage text below); any other count
+// prints the usage text and terminates the process with exit code 1.
+//
+//   argv[2]      data type selector   (GemmDataType)
+//   argv[3]      layout selector      (GemmMatrixLayout)
+//   argv[4..7]   verification flag, init method, log flag, time-kernel flag
+//   argv[8..10]  M, N, K
+//   argv[11..13] StrideA, StrideB, StrideC          (negative => use default)
+//   argv[14..16] BatchStrideA, BatchStrideB, BatchStrideC (negative => use default)
+//   argv[17]     BatchCount
+//
+// Returns 0 when the profiled run reports success, 1 when it reports failure
+// or when the requested data type / layout combination is not dispatched here.
+int profile_batched_gemm_multi_d(int argc, char* argv[])
+{
+    if(argc != 18)
+    {
+        // clang-format off
+        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
+        printf("arg2: data type (0: fp16; 1: int8)\n");
+        printf("arg3: matrix layout (0: A[g, m, k] * B[g, k, n] = C[g, m, n];\n");
+        printf("                     1: A[g, m, k] * B[g, n, k] = C[g, m, n];\n");
+        printf("                     2: A[g, k, m] * B[g, k, n] = C[g, m, n];\n");
+        printf("                     3: A[g, k, m] * B[g, n, k] = C[g, m, n])\n");
+        printf("arg4: verification (0: no; 1: yes)\n");
+        printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
+        printf("arg6: print tensor value (0: no; 1: yes)\n");
+        printf("arg7: time kernel (0=no, 1=yes)\n");
+        printf("arg8 to 17: M, N, K, StrideA, StrideB, StrideC, BatchStrideA, BatchStrideB, BatchStrideC, BatchCount\n");
+        // clang-format on
+        exit(1);
+    }
+
+    // Selectors and run flags.
+    const auto data_type       = static_cast<GemmDataType>(std::stoi(argv[2]));
+    const auto layout          = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
+    const bool do_verification = std::stoi(argv[4]);
+    const int init_method      = std::stoi(argv[5]);
+    const bool do_log          = std::stoi(argv[6]);
+    const bool time_kernel     = std::stoi(argv[7]);
+
+    // Problem size.
+    const int M = std::stoi(argv[8]);
+    const int N = std::stoi(argv[9]);
+    const int K = std::stoi(argv[10]);
+
+    // Leading-dimension strides; a negative value requests the layout default.
+    const int StrideA = std::stoi(argv[11]);
+    const int StrideB = std::stoi(argv[12]);
+    const int StrideC = std::stoi(argv[13]);
+
+    // Per-batch strides; a negative value requests the default derived below.
+    const int BatchStrideA = std::stoi(argv[14]);
+    const int BatchStrideB = std::stoi(argv[15]);
+    const int BatchStrideC = std::stoi(argv[16]);
+
+    const int BatchCount = std::stoi(argv[17]);
+
+    using F16  = ck::half_t;
+    using INT8 = int8_t;
+
+    using Row = ck::tensor_layout::gemm::RowMajor;
+    using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+    // Generic driver: the arguments are tag values whose *types* carry the
+    // element types and layouts; their runtime values are never read.
+    auto profile =
+        [&](auto a_type, auto b_type, auto c_type, auto a_layout, auto b_layout, auto c_layout) {
+            using ADataType  = decltype(a_type);
+            using BDataType  = decltype(b_type);
+            using CDataType  = decltype(c_type);
+            using DsDataType = ck::Tuple<>; // empty tuple: no D tensors are profiled here
+
+            using ALayout  = decltype(a_layout);
+            using BLayout  = decltype(b_layout);
+            using CLayout  = decltype(c_layout);
+            using DsLayout = ck::Tuple<>; // empty tuple, matching DsDataType
+
+            // Default stride of each matrix, chosen by its layout tag.
+            const int DefaultStrideA = ck::is_same_v<ALayout, Row> ? K : M;
+            const int DefaultStrideB = ck::is_same_v<BLayout, Row> ? N : K;
+            const int DefaultStrideC = ck::is_same_v<CLayout, Row> ? N : M;
+
+            const int StrideA_ = (StrideA < 0) ? DefaultStrideA : StrideA;
+            const int StrideB_ = (StrideB < 0) ? DefaultStrideB : StrideB;
+            const int StrideC_ = (StrideC < 0) ? DefaultStrideC : StrideC;
+
+            // Default batch stride: the other matrix dimension times the
+            // (already resolved) stride.
+            const int DefaultBatchStrideA = (ck::is_same_v<ALayout, Row> ? M : K) * StrideA_;
+            const int DefaultBatchStrideB = (ck::is_same_v<BLayout, Row> ? K : N) * StrideB_;
+            const int DefaultBatchStrideC = (ck::is_same_v<CLayout, Row> ? M : N) * StrideC_;
+
+            const int BatchStrideA_ = (BatchStrideA < 0) ? DefaultBatchStrideA : BatchStrideA;
+            const int BatchStrideB_ = (BatchStrideB < 0) ? DefaultBatchStrideB : BatchStrideB;
+            const int BatchStrideC_ = (BatchStrideC < 0) ? DefaultBatchStrideC : BatchStrideC;
+
+            using AElementOp = ck::tensor_operation::element_wise::PassThrough;
+            using BElementOp = ck::tensor_operation::element_wise::PassThrough;
+            using CElementOp = ck::tensor_operation::element_wise::PassThrough;
+
+            using DeviceOp = ck::tensor_operation::device::DeviceBatchedGemmMultiD<ALayout,
+                                                                                   BLayout,
+                                                                                   DsLayout,
+                                                                                   CLayout,
+                                                                                   ADataType,
+                                                                                   BDataType,
+                                                                                   DsDataType,
+                                                                                   CDataType,
+                                                                                   AElementOp,
+                                                                                   BElementOp,
+                                                                                   CElementOp>;
+
+            // NOTE(review): batch strides are passed before the leading-dimension
+            // strides here, whereas the tests under test/batched_gemm in this same
+            // change pass (K, N, N) before (M * K, K * N, M * N). All parameters
+            // are int, so a mismatch would compile silently -- confirm the order
+            // against the profile_batched_gemm_impl signature.
+            bool pass = ck::profiler::profile_batched_gemm_impl<ADataType,
+                                                                BDataType,
+                                                                CDataType,
+                                                                ALayout,
+                                                                BLayout,
+                                                                CLayout,
+                                                                AElementOp,
+                                                                BElementOp,
+                                                                CElementOp,
+                                                                DeviceOp>(do_verification,
+                                                                          init_method,
+                                                                          do_log,
+                                                                          time_kernel,
+                                                                          M,
+                                                                          N,
+                                                                          K,
+                                                                          BatchStrideA_,
+                                                                          BatchStrideB_,
+                                                                          BatchStrideC_,
+                                                                          StrideA_,
+                                                                          StrideB_,
+                                                                          StrideC_,
+                                                                          BatchCount);
+
+            return pass ? 0 : 1;
+        };
+
+    // Dispatch on (data type, layout). C is row-major in every supported case.
+    if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
+    {
+        return profile(F16{}, F16{}, F16{}, Row{}, Row{}, Row{});
+    }
+    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
+    {
+        return profile(F16{}, F16{}, F16{}, Row{}, Col{}, Row{});
+    }
+    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN)
+    {
+        return profile(F16{}, F16{}, F16{}, Col{}, Row{}, Row{});
+    }
+    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN)
+    {
+        return profile(F16{}, F16{}, F16{}, Col{}, Col{}, Row{});
+    }
+    else if(data_type == GemmDataType::INT8_INT8_INT8 && layout == GemmMatrixLayout::MK_KN_MN)
+    {
+        return profile(INT8{}, INT8{}, INT8{}, Row{}, Row{}, Row{});
+    }
+    else if(data_type == GemmDataType::INT8_INT8_INT8 && layout == GemmMatrixLayout::MK_NK_MN)
+    {
+        return profile(INT8{}, INT8{}, INT8{}, Row{}, Col{}, Row{});
+    }
+    else if(data_type == GemmDataType::INT8_INT8_INT8 && layout == GemmMatrixLayout::KM_KN_MN)
+    {
+        return profile(INT8{}, INT8{}, INT8{}, Col{}, Row{}, Row{});
+    }
+    else if(data_type == GemmDataType::INT8_INT8_INT8 && layout == GemmMatrixLayout::KM_NK_MN)
+    {
+        return profile(INT8{}, INT8{}, INT8{}, Col{}, Col{}, Row{});
+    }
+    else
+    {
+        // Selector values outside the enums above end up here.
+        std::cout << "this data_type & layout is not implemented" << std::endl;
+
+        return 1;
+    }
+}
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_batched_gemm_multi_d); // presumably makes this entry point selectable by OP_NAME in the profiler; macro definition not visible here -- TODO confirm
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -58,6 +58,7 @@ add_subdirectory(elementwise_normalization)
 add_subdirectory(batchnorm)
 add_subdirectory(contraction)
 add_subdirectory(pool_fwd)
+add_subdirectory(batched_gemm_multi_d)
 if(GPU_TARGETS MATCHES "gfx1100")
    add_subdirectory(wmma_op)
 endif()
--- a/test/batched_gemm/batched_gemm_bf16.cpp
+++ b/test/batched_gemm/batched_gemm_bf16.cpp
@@ -5,6 +5,8 @@

 #include "profiler/profile_batched_gemm_impl.hpp"

+#include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
+
 namespace {
 using ADataType = ck::bhalf_t;
 using BDataType = ck::bhalf_t;
@@ -12,6 +14,8 @@ using CDataType = ck::bhalf_t;

 using Row = ck::tensor_layout::gemm::RowMajor;
 using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 } // namespace

 int main()
@@ -23,21 +27,87 @@ int main()

    bool pass = true;

-    pass = pass &&
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Row, Row>(
-               true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
+    using namespace ck::tensor_operation::device;
+
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
+                                                           BDataType,
+                                                           CDataType,
+                                                           Row,
+                                                           Row,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Row,
+                                                                             Row,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);

-    pass = pass &&
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Col, Row>(
-               true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
+                                                           BDataType,
+                                                           CDataType,
+                                                           Row,
+                                                           Col,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Row,
+                                                                             Col,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);

-    pass = pass &&
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Row, Row>(
-               true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
+                                                           BDataType,
+                                                           CDataType,
+                                                           Col,
+                                                           Row,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Col,
+                                                                             Row,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);

-    pass = pass &&
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Col, Row>(
-               true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
+                                                           BDataType,
+                                                           CDataType,
+                                                           Col,
+                                                           Col,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Col,
+                                                                             Col,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);

    std::cout << "test BatchedGEMM bf16: " << (pass ? "Pass" : "Fail") << std::endl;
    return pass ? 0 : 1;

--- a/test/batched_gemm/batched_gemm_fp16.cpp
+++ b/test/batched_gemm/batched_gemm_fp16.cpp
@@ -5,6 +5,8 @@

 #include "profiler/profile_batched_gemm_impl.hpp"

+#include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
+
 namespace {
 using ADataType = ck::half_t;
 using BDataType = ck::half_t;
@@ -12,6 +14,8 @@ using CDataType = ck::half_t;

 using Row = ck::tensor_layout::gemm::RowMajor;
 using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 } // namespace

 int main()
@@ -23,21 +27,87 @@ int main()

    bool pass = true;

-    pass = pass &&
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Row, Row>(
-               true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
+    using namespace ck::tensor_operation::device;
+
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
+                                                           BDataType,
+                                                           CDataType,
+                                                           Row,
+                                                           Row,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Row,
+                                                                             Row,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);

-    pass = pass &&
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Col, Row>(
-               true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
+                                                           BDataType,
+                                                           CDataType,
+                                                           Row,
+                                                           Col,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Row,
+                                                                             Col,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);

-    pass = pass &&
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Row, Row>(
-               true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
+                                                           BDataType,
+                                                           CDataType,
+                                                           Col,
+                                                           Row,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Col,
+                                                                             Row,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);

-    pass = pass &&
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Col, Row>(
-               true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
+                                                           BDataType,
+                                                           CDataType,
+                                                           Col,
+                                                           Col,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Col,
+                                                                             Col,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);

    std::cout << "test BatchedGEMM fp16: " << (pass ? "Pass" : "Fail") << std::endl;
    return pass ? 0 : 1;

--- a/test/batched_gemm/batched_gemm_fp32.cpp
+++ b/test/batched_gemm/batched_gemm_fp32.cpp
@@ -5,6 +5,8 @@

 #include "profiler/profile_batched_gemm_impl.hpp"

+#include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
+
 namespace {
 using ADataType = float;
 using BDataType = float;
@@ -12,6 +14,8 @@ using CDataType = float;

 using Row = ck::tensor_layout::gemm::RowMajor;
 using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 } // namespace

 int main()
@@ -23,21 +27,87 @@ int main()

    bool pass = true;

-    pass = pass &&
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Row, Row>(
-               true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
+    using namespace ck::tensor_operation::device;
+
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
+                                                           BDataType,
+                                                           CDataType,
+                                                           Row,
+                                                           Row,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Row,
+                                                                             Row,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);

-    pass = pass &&
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Col, Row>(
-               true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
+                                                           BDataType,
+                                                           CDataType,
+                                                           Row,
+                                                           Col,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Row,
+                                                                             Col,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);

-    pass = pass &&
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Row, Row>(
-               true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
+                                                           BDataType,
+                                                           CDataType,
+                                                           Col,
+                                                           Row,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Col,
+                                                                             Row,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);

-    pass = pass &&
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Col, Row>(
-               true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
+                                                           BDataType,
+                                                           CDataType,
+                                                           Col,
+                                                           Col,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Col,
+                                                                             Col,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);

    std::cout << "test BatchedGEMM fp32: " << (pass ? "Pass" : "Fail") << std::endl;
    return pass ? 0 : 1;

--- a/test/batched_gemm/batched_gemm_int8.cpp
+++ b/test/batched_gemm/batched_gemm_int8.cpp
@@ -5,6 +5,8 @@

 #include "profiler/profile_batched_gemm_impl.hpp"

+#include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
+
 namespace {
 using ADataType = int8_t;
 using BDataType = int8_t;
@@ -12,6 +14,8 @@ using CDataType = int8_t;

 using Row = ck::tensor_layout::gemm::RowMajor;
 using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 } // namespace

 int main()
@@ -23,21 +27,87 @@ int main()

    bool pass = true;

-    pass = pass &&
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Row, Row>(
-               true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
+    using namespace ck::tensor_operation::device;
+
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
+                                                           BDataType,
+                                                           CDataType,
+                                                           Row,
+                                                           Row,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Row,
+                                                                             Row,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);

-    pass = pass &&
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Col, Row>(
-               true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
+                                                           BDataType,
+                                                           CDataType,
+                                                           Row,
+                                                           Col,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Row,
+                                                                             Col,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);

-    pass = pass &&
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Row, Row>(
-               true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
+                                                           BDataType,
+                                                           CDataType,
+                                                           Col,
+                                                           Row,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Col,
+                                                                             Row,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);

-    pass = pass &&
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Col, Row>(
-               true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
+                                                           BDataType,
+                                                           CDataType,
+                                                           Col,
+                                                           Col,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Col,
+                                                                             Col,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);

    std::cout << "test BatchedGEMM int8: " << (pass ? "Pass" : "Fail") << std::endl;
    return pass ? 0 : 1;

--- a/test/batched_gemm_multi_d/CMakeLists.txt
+++ b/test/batched_gemm_multi_d/CMakeLists.txt
+# TODO: Enable for gfx90a after compiler fix
+if(NOT GPU_TARGETS MATCHES "gfx90a")
+    add_gtest_executable(test_batched_gemm_multi_d test_batched_gemm_multi_d.cpp)
+    target_link_libraries(test_batched_gemm_multi_d PRIVATE utility device_batched_gemm_multi_d_instance)
+endif()