Add DeviceBatchedGemmMultipleD_Dl (#732)

* Add DeviceBatchedGemmMultipleD_Dl
* Fix batched_gemm tests
* Fix comments
* test_batched_gemm_multi_d fixes
* Fix args for isSupported batchedGemmMultipleDDl
* Disable tests for gfx90a

Add DeviceBatchedGemmMultipleD_Dl (#732)
* Add DeviceBatchedGemmMultipleD_Dl
* Fix batched_gemm tests
* Fix comments
* test_batched_gemm_multi_d fixes
* Fix args for isSupported batchedGemmMultipleDDl
* Disable tests for gfx90a
fc9f9756 · Bartłomiej Kocot · GitHub · 7c24654c · fc9f9756 · fc9f9756
Unverified Commit fc9f9756 authored Jun 12, 2023 by Bartłomiej Kocot Committed by GitHub Jun 12, 2023
11 changed files
--- a/profiler/include/profiler/profile_batched_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_impl.hpp
@@ -8,9 +8,11 @@
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_batched_gemm.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
+#include "ck/library/tensor_operation_instance/gpu/batched_gemm_multi_d.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
@@ -27,7 +29,11 @@ template <typename ADataType,
          typename CDataType,
          typename ALayout,
          typename BLayout,
-          typename CLayout>
+          typename CLayout,
+          typename AElementOp,
+          typename BElementOp,
+          typename CElementOp,
+          typename DeviceOp>
 bool profile_batched_gemm_impl(int do_verification,
                               int init_method,
                               bool do_log,
@@ -88,10 +94,6 @@ bool profile_batched_gemm_impl(int do_verification,
        b_g_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
    }
-    using AElementOp = ck::tensor_operation::element_wise::PassThrough;
-    using BElementOp = ck::tensor_operation::element_wise::PassThrough;
-    using CElementOp = ck::tensor_operation::element_wise::PassThrough;
    const auto a_element_op = AElementOp{};
    const auto b_element_op = BElementOp{};
    const auto c_element_op = CElementOp{};
@@ -124,16 +126,6 @@ bool profile_batched_gemm_impl(int do_verification,
    b_device_buf.ToDevice(b_g_k_n.mData.data());
    c_device_buf.ToDevice(c_g_m_n_device_result.mData.data());
-    using DeviceOp = ck::tensor_operation::device::DeviceBatchedGemm<ALayout,
-                                                                     BLayout,
-                                                                     CLayout,
-                                                                     ADataType,
-                                                                     BDataType,
-                                                                     CDataType,
-                                                                     AElementOp,
-                                                                     BElementOp,
-                                                                     CElementOp>;
    // get device op instances
    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
        DeviceOp>::GetInstances();
@@ -148,23 +140,62 @@ bool profile_batched_gemm_impl(int do_verification,
    // profile device op instances
    for(auto& op_ptr : op_ptrs)
    {
-        auto argument_ptr =
+        std::unique_ptr<tensor_operation::device::BaseArgument> argument_ptr;
-            op_ptr->MakeArgumentPointer(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
+        // false branch for multi d dl kernel
-                                        static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
+        if constexpr(std::is_same<
-                                        static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
+                         DeviceOp,
-                                        M,
+                         ck::tensor_operation::device::DeviceBatchedGemm<ALayout,
-                                        N,
+                                                                         BLayout,
-                                        K,
+                                                                         CLayout,
-                                        StrideA,
+                                                                         ADataType,
-                                        StrideB,
+                                                                         BDataType,
-                                        StrideC,
+                                                                         CDataType,
-                                        BatchStrideA,
+                                                                         AElementOp,
-                                        BatchStrideB,
+                                                                         BElementOp,
-                                        BatchStrideC,
+                                                                         CElementOp>>::value)
-                                        BatchCount,
+        {
-                                        ck::tensor_operation::element_wise::PassThrough{},
-                                        ck::tensor_operation::element_wise::PassThrough{},
+            argument_ptr =
-                                        ck::tensor_operation::element_wise::PassThrough{});
+                op_ptr->MakeArgumentPointer(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
+                                            static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
+                                            static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
+                                            M,
+                                            N,
+                                            K,
+                                            StrideA,
+                                            StrideB,
+                                            StrideC,
+                                            BatchStrideA,
+                                            BatchStrideB,
+                                            BatchStrideC,
+                                            BatchCount,
+                                            ck::tensor_operation::element_wise::PassThrough{},
+                                            ck::tensor_operation::element_wise::PassThrough{},
+                                            ck::tensor_operation::element_wise::PassThrough{});
+        }
+        else
+        {
+            argument_ptr =
+                op_ptr->MakeArgumentPointer(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
+                                            static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
+                                            {},
+                                            static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
+                                            M,
+                                            N,
+                                            K,
+                                            BatchCount,
+                                            StrideA,
+                                            StrideB,
+                                            {},
+                                            StrideC,
+                                            BatchStrideA,
+                                            BatchStrideB,
+                                            {},
+                                            BatchStrideC,
+                                            ck::tensor_operation::element_wise::PassThrough{},
+                                            ck::tensor_operation::element_wise::PassThrough{},
+                                            ck::tensor_operation::element_wise::PassThrough{});
+        }
        auto invoker_ptr = op_ptr->MakeInvokerPointer();

--- a/profiler/src/CMakeLists.txt
+++ b/profiler/src/CMakeLists.txt
@@ -34,6 +34,7 @@ set(PROFILER_SOURCES
    profile_grouped_gemm_fastgelu.cpp
    profile_contraction_bilinear.cpp
    profile_contraction_scale.cpp
+    profile_batched_gemm_multi_d.cpp
 )
 set(PROFILER_EXECUTABLE ckProfiler)
@@ -77,5 +78,5 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fastgel
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_bilinear_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_scale_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool_fwd_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_multi_d_instance)
 rocm_install(TARGETS ${PROFILER_EXECUTABLE} COMPONENT profiler)
--- a/profiler/src/profile_batched_gemm.cpp
+++ b/profiler/src/profile_batched_gemm.cpp
@@ -10,6 +10,8 @@
 #include "profiler/profile_batched_gemm_impl.hpp"
 #include "profiler_operation_registry.hpp"
+#include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
 enum struct GemmMatrixLayout
 {
    MK_KN_MN, // 0
@@ -78,55 +80,72 @@ int profile_batched_gemm(int argc, char* argv[])
    using Row = ck::tensor_layout::gemm::RowMajor;
    using Col = ck::tensor_layout::gemm::ColumnMajor;
-    auto profile = [&](auto a_type,
+    auto profile =
-                       auto b_type,
+        [&](auto a_type, auto b_type, auto c_type, auto a_layout, auto b_layout, auto c_layout) {
-                       auto c_type,
+            using ADataType = decltype(a_type);
-                       auto a_layout,
+            using BDataType = decltype(b_type);
-                       auto b_layout,
+            using CDataType = decltype(c_type);
-                       auto c_layout) {
-        using ADataType = decltype(a_type);
+            using ALayout = decltype(a_layout);
-        using BDataType = decltype(b_type);
+            using BLayout = decltype(b_layout);
-        using CDataType = decltype(c_type);
+            using CLayout = decltype(c_layout);
-        using ALayout = decltype(a_layout);
+            const int DefaultStrideA = ck::is_same_v<ALayout, Row> ? K : M;
-        using BLayout = decltype(b_layout);
+            const int DefaultStrideB = ck::is_same_v<BLayout, Row> ? N : K;
-        using CLayout = decltype(c_layout);
+            const int DefaultStrideC = ck::is_same_v<CLayout, Row> ? N : M;
-        const int DefaultStrideA = ck::is_same_v<ALayout, Row> ? K : M;
+            const int StrideA_ = (StrideA < 0) ? DefaultStrideA : StrideA;
-        const int DefaultStrideB = ck::is_same_v<BLayout, Row> ? N : K;
+            const int StrideB_ = (StrideB < 0) ? DefaultStrideB : StrideB;
-        const int DefaultStrideC = ck::is_same_v<CLayout, Row> ? N : M;
+            const int StrideC_ = (StrideC < 0) ? DefaultStrideC : StrideC;
-        const int StrideA_ = (StrideA < 0) ? DefaultStrideA : StrideA;
+            const int DefaultBatchStrideA = (ck::is_same_v<ALayout, Row> ? M : K) * StrideA_;
-        const int StrideB_ = (StrideB < 0) ? DefaultStrideB : StrideB;
+            const int DefaultBatchStrideB = (ck::is_same_v<BLayout, Row> ? K : N) * StrideB_;
-        const int StrideC_ = (StrideC < 0) ? DefaultStrideC : StrideC;
+            const int DefaultBatchStrideC = (ck::is_same_v<CLayout, Row> ? M : N) * StrideC_;
-        const int DefaultBatchStrideA = (ck::is_same_v<ALayout, Row> ? M : K) * StrideA_;
+            const int BatchStrideA_ = (BatchStrideA < 0) ? DefaultBatchStrideA : BatchStrideA;
-        const int DefaultBatchStrideB = (ck::is_same_v<BLayout, Row> ? K : N) * StrideB_;
+            const int BatchStrideB_ = (BatchStrideB < 0) ? DefaultBatchStrideB : BatchStrideB;
-        const int DefaultBatchStrideC = (ck::is_same_v<CLayout, Row> ? M : N) * StrideC_;
+            const int BatchStrideC_ = (BatchStrideC < 0) ? DefaultBatchStrideC : BatchStrideC;
-        const int BatchStrideA_ = (BatchStrideA < 0) ? DefaultBatchStrideA : BatchStrideA;
+            using AElementOp = ck::tensor_operation::element_wise::PassThrough;
-        const int BatchStrideB_ = (BatchStrideB < 0) ? DefaultBatchStrideB : BatchStrideB;
+            using BElementOp = ck::tensor_operation::element_wise::PassThrough;
-        const int BatchStrideC_ = (BatchStrideC < 0) ? DefaultBatchStrideC : BatchStrideC;
+            using CElementOp = ck::tensor_operation::element_wise::PassThrough;
-        bool pass = ck::profiler::
+            using DeviceOp = ck::tensor_operation::device::DeviceBatchedGemm<ALayout,
-            profile_batched_gemm_impl<ADataType, BDataType, CDataType, ALayout, BLayout, CLayout>(
+                                                                             BLayout,
-                do_verification,
+                                                                             CLayout,
-                init_method,
+                                                                             ADataType,
-                do_log,
+                                                                             BDataType,
-                time_kernel,
+                                                                             CDataType,
-                M,
+                                                                             AElementOp,
-                N,
+                                                                             BElementOp,
-                K,
+                                                                             CElementOp>;
-                BatchStrideA_,
-                BatchStrideB_,
+            bool pass = ck::profiler::profile_batched_gemm_impl<ADataType,
-                BatchStrideC_,
+                                                                BDataType,
-                StrideA_,
+                                                                CDataType,
-                StrideB_,
+                                                                ALayout,
-                StrideC_,
+                                                                BLayout,
-                BatchCount);
+                                                                CLayout,
+                                                                AElementOp,
-        return pass ? 0 : 1;
+                                                                BElementOp,
-    };
+                                                                CElementOp,
+                                                                DeviceOp>(do_verification,
+                                                                          init_method,
+                                                                          do_log,
+                                                                          time_kernel,
+                                                                          M,
+                                                                          N,
+                                                                          K,
+                                                                          BatchStrideA_,
+                                                                          BatchStrideB_,
+                                                                          BatchStrideC_,
+                                                                          StrideA_,
+                                                                          StrideB_,
+                                                                          StrideC_,
+                                                                          BatchCount);
+            return pass ? 0 : 1;
+        };
    if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN)
    {

--- a/profiler/src/profile_batched_gemm_multi_d.cpp
+++ b/profiler/src/profile_batched_gemm_multi_d.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#include <cstdint>
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include "profiler/profile_batched_gemm_impl.hpp"
+#include "profiler_operation_registry.hpp"
+#include "ck/library/tensor_operation_instance/gpu/batched_gemm_multi_d.hpp"
+enum struct GemmMatrixLayout // Layout selector parsed from argv[3]; values match the usage text.
+{
+    MK_KN_MN, // 0: A[g, m, k] * B[g, k, n] = C[g, m, n]
+    MK_NK_MN, // 1: A[g, m, k] * B[g, n, k] = C[g, m, n]
+    KM_KN_MN, // 2: A[g, k, m] * B[g, k, n] = C[g, m, n]
+    KM_NK_MN, // 3: A[g, k, m] * B[g, n, k] = C[g, m, n]
+};
+enum struct GemmDataType // Data-type selector parsed from argv[2]; A, B and C share one type.
+{
+    F16_F16_F16,    // 0: fp16 (dispatched with ck::half_t)
+    INT8_INT8_INT8, // 1: int8 (dispatched with int8_t)
+};
+// Name and description under which this operation is exposed; both are also
+// embedded in the usage text below and handed to REGISTER_PROFILER_OPERATION.
+#define OP_NAME "batched_gemm_multi_d"
+#define OP_DESC "Batched GEMM multi D"
+// Command-line entry point for profiling DeviceBatchedGemmMultiD instances.
+//
+// Expects exactly 18 argv entries (see the usage text). A negative StrideX or
+// BatchStrideX argument selects the layout-derived default computed inside the
+// `profile` lambda.
+//
+// Returns 0 when profile_batched_gemm_impl reports success, 1 when it reports
+// failure or when the requested data type / layout pair is not handled.
+// Calls exit(1) after printing the usage text if the argument count is wrong.
+int profile_batched_gemm_multi_d(int argc, char* argv[])
+{
+    if(argc != 18)
+    {
+        // clang-format off
+        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
+        printf("arg2: data type (0: fp16; 1: int8)\n");
+        printf("arg3: matrix layout (0: A[g, m, k] * B[g, k, n] = C[g, m, n];\n");
+        printf("                     1: A[g, m, k] * B[g, n, k] = C[g, m, n];\n");
+        printf("                     2: A[g, k, m] * B[g, k, n] = C[g, m, n];\n");
+        printf("                     3: A[g, k, m] * B[g, n, k] = C[g, m, n])\n");
+        printf("arg4: verification (0: no; 1: yes)\n");
+        printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
+        printf("arg6: print tensor value (0: no; 1: yes)\n");
+        printf("arg7: time kernel (0: no; 1: yes)\n");
+        printf("arg8 to 17: M, N, K, StrideA, StrideB, StrideC, BatchStrideA, BatchStrideB, BatchStrideC, BatchCount\n");
+        // clang-format on
+        exit(1);
+    }
+    // Positional arguments; std::stoi throws on non-numeric input.
+    const auto data_type       = static_cast<GemmDataType>(std::stoi(argv[2]));
+    const auto layout          = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
+    const bool do_verification = std::stoi(argv[4]);
+    const int init_method      = std::stoi(argv[5]);
+    const bool do_log          = std::stoi(argv[6]);
+    const bool time_kernel     = std::stoi(argv[7]);
+    const int M = std::stoi(argv[8]);
+    const int N = std::stoi(argv[9]);
+    const int K = std::stoi(argv[10]);
+    const int StrideA = std::stoi(argv[11]);
+    const int StrideB = std::stoi(argv[12]);
+    const int StrideC = std::stoi(argv[13]);
+    const int BatchStrideA = std::stoi(argv[14]);
+    const int BatchStrideB = std::stoi(argv[15]);
+    const int BatchStrideC = std::stoi(argv[16]);
+    const int BatchCount = std::stoi(argv[17]);
+    using F16  = ck::half_t;
+    using INT8 = int8_t;
+    using Row = ck::tensor_layout::gemm::RowMajor;
+    using Col = ck::tensor_layout::gemm::ColumnMajor;
+    // Generic dispatcher: the arguments are value-less tags whose types pick the
+    // data types and layouts used to instantiate profile_batched_gemm_impl.
+    auto profile =
+        [&](auto a_type, auto b_type, auto c_type, auto a_layout, auto b_layout, auto c_layout) {
+            using ADataType  = decltype(a_type);
+            using BDataType  = decltype(b_type);
+            using CDataType  = decltype(c_type);
+            using DsDataType = ck::Tuple<>; // empty Ds tuple
+            using ALayout  = decltype(a_layout);
+            using BLayout  = decltype(b_layout);
+            using CLayout  = decltype(c_layout);
+            using DsLayout = ck::Tuple<>; // empty Ds tuple
+            // Default stride per matrix, chosen by whether its layout is Row.
+            const int DefaultStrideA = ck::is_same_v<ALayout, Row> ? K : M;
+            const int DefaultStrideB = ck::is_same_v<BLayout, Row> ? N : K;
+            const int DefaultStrideC = ck::is_same_v<CLayout, Row> ? N : M;
+            // A negative command-line stride means "use the default".
+            const int StrideA_ = (StrideA < 0) ? DefaultStrideA : StrideA;
+            const int StrideB_ = (StrideB < 0) ? DefaultStrideB : StrideB;
+            const int StrideC_ = (StrideC < 0) ? DefaultStrideC : StrideC;
+            // Default batch stride: the other dimension times the resolved stride.
+            const int DefaultBatchStrideA = (ck::is_same_v<ALayout, Row> ? M : K) * StrideA_;
+            const int DefaultBatchStrideB = (ck::is_same_v<BLayout, Row> ? K : N) * StrideB_;
+            const int DefaultBatchStrideC = (ck::is_same_v<CLayout, Row> ? M : N) * StrideC_;
+            const int BatchStrideA_ = (BatchStrideA < 0) ? DefaultBatchStrideA : BatchStrideA;
+            const int BatchStrideB_ = (BatchStrideB < 0) ? DefaultBatchStrideB : BatchStrideB;
+            const int BatchStrideC_ = (BatchStrideC < 0) ? DefaultBatchStrideC : BatchStrideC;
+            using AElementOp = ck::tensor_operation::element_wise::PassThrough;
+            using BElementOp = ck::tensor_operation::element_wise::PassThrough;
+            using CElementOp = ck::tensor_operation::element_wise::PassThrough;
+            using DeviceOp = ck::tensor_operation::device::DeviceBatchedGemmMultiD<ALayout,
+                                                                                   BLayout,
+                                                                                   DsLayout,
+                                                                                   CLayout,
+                                                                                   ADataType,
+                                                                                   BDataType,
+                                                                                   DsDataType,
+                                                                                   CDataType,
+                                                                                   AElementOp,
+                                                                                   BElementOp,
+                                                                                   CElementOp>;
+            // NOTE(review): batch strides are passed before the per-matrix strides
+            // here, while the batched_gemm test callers in this change pass K/N/N
+            // style strides first and M*K/K*N/M*N after. The impl's parameter list
+            // is not visible from this file - confirm which order it expects.
+            bool pass = ck::profiler::profile_batched_gemm_impl<ADataType,
+                                                                BDataType,
+                                                                CDataType,
+                                                                ALayout,
+                                                                BLayout,
+                                                                CLayout,
+                                                                AElementOp,
+                                                                BElementOp,
+                                                                CElementOp,
+                                                                DeviceOp>(do_verification,
+                                                                          init_method,
+                                                                          do_log,
+                                                                          time_kernel,
+                                                                          M,
+                                                                          N,
+                                                                          K,
+                                                                          BatchStrideA_,
+                                                                          BatchStrideB_,
+                                                                          BatchStrideC_,
+                                                                          StrideA_,
+                                                                          StrideB_,
+                                                                          StrideC_,
+                                                                          BatchCount);
+            return pass ? 0 : 1;
+        };
+    // Runtime (data type, layout) pair -> compile-time instantiation. C is Row in
+    // every supported combination.
+    if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
+    {
+        return profile(F16{}, F16{}, F16{}, Row{}, Row{}, Row{});
+    }
+    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
+    {
+        return profile(F16{}, F16{}, F16{}, Row{}, Col{}, Row{});
+    }
+    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN)
+    {
+        return profile(F16{}, F16{}, F16{}, Col{}, Row{}, Row{});
+    }
+    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN)
+    {
+        return profile(F16{}, F16{}, F16{}, Col{}, Col{}, Row{});
+    }
+    else if(data_type == GemmDataType::INT8_INT8_INT8 && layout == GemmMatrixLayout::MK_KN_MN)
+    {
+        return profile(INT8{}, INT8{}, INT8{}, Row{}, Row{}, Row{});
+    }
+    else if(data_type == GemmDataType::INT8_INT8_INT8 && layout == GemmMatrixLayout::MK_NK_MN)
+    {
+        return profile(INT8{}, INT8{}, INT8{}, Row{}, Col{}, Row{});
+    }
+    else if(data_type == GemmDataType::INT8_INT8_INT8 && layout == GemmMatrixLayout::KM_KN_MN)
+    {
+        return profile(INT8{}, INT8{}, INT8{}, Col{}, Row{}, Row{});
+    }
+    else if(data_type == GemmDataType::INT8_INT8_INT8 && layout == GemmMatrixLayout::KM_NK_MN)
+    {
+        return profile(INT8{}, INT8{}, INT8{}, Col{}, Col{}, Row{});
+    }
+    else
+    {
+        std::cout << "this data_type & layout is not implemented" << std::endl;
+        return 1;
+    }
+}
+// Presumably hooks the entry point into the profiler's operation table under
+// OP_NAME; the macro is defined in profiler_operation_registry.hpp (not shown).
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_batched_gemm_multi_d);
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -58,6 +58,7 @@ add_subdirectory(elementwise_normalization)
 add_subdirectory(batchnorm)
 add_subdirectory(contraction)
 add_subdirectory(pool_fwd)
+add_subdirectory(batched_gemm_multi_d)
 if(GPU_TARGETS MATCHES "gfx1100")
    add_subdirectory(wmma_op)
 endif()
--- a/test/batched_gemm/batched_gemm_bf16.cpp
+++ b/test/batched_gemm/batched_gemm_bf16.cpp
@@ -5,6 +5,8 @@
 #include "profiler/profile_batched_gemm_impl.hpp"
+#include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
 namespace {
 using ADataType = ck::bhalf_t;
 using BDataType = ck::bhalf_t;
@@ -12,6 +14,8 @@ using CDataType = ck::bhalf_t;
 using Row = ck::tensor_layout::gemm::RowMajor;
 using Col = ck::tensor_layout::gemm::ColumnMajor;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 } // namespace
 int main()
@@ -23,21 +27,87 @@ int main()
    bool pass = true;
-    pass = pass &&
+    using namespace ck::tensor_operation::device;
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Row, Row>(
-               true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
+                                                           BDataType,
+                                                           CDataType,
+                                                           Row,
+                                                           Row,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Row,
+                                                                             Row,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
-    pass = pass &&
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Col, Row>(
+                                                           BDataType,
-               true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
+                                                           CDataType,
+                                                           Row,
+                                                           Col,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Row,
+                                                                             Col,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
-    pass = pass &&
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Row, Row>(
+                                                           BDataType,
-               true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
+                                                           CDataType,
+                                                           Col,
+                                                           Row,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Col,
+                                                                             Row,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
-    pass = pass &&
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Col, Row>(
+                                                           BDataType,
-               true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
+                                                           CDataType,
+                                                           Col,
+                                                           Col,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Col,
+                                                                             Col,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
    std::cout << "test BatchedGEMM bf16: " << (pass ? "Pass" : "Fail") << std::endl;
    return pass ? 0 : 1;

--- a/test/batched_gemm/batched_gemm_fp16.cpp
+++ b/test/batched_gemm/batched_gemm_fp16.cpp
@@ -5,6 +5,8 @@
 #include "profiler/profile_batched_gemm_impl.hpp"
+#include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
 namespace {
 using ADataType = ck::half_t;
 using BDataType = ck::half_t;
@@ -12,6 +14,8 @@ using CDataType = ck::half_t;
 using Row = ck::tensor_layout::gemm::RowMajor;
 using Col = ck::tensor_layout::gemm::ColumnMajor;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 } // namespace
 int main()
@@ -23,21 +27,87 @@ int main()
    bool pass = true;
-    pass = pass &&
+    using namespace ck::tensor_operation::device;
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Row, Row>(
-               true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
+                                                           BDataType,
+                                                           CDataType,
+                                                           Row,
+                                                           Row,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Row,
+                                                                             Row,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
-    pass = pass &&
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Col, Row>(
+                                                           BDataType,
-               true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
+                                                           CDataType,
+                                                           Row,
+                                                           Col,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Row,
+                                                                             Col,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
-    pass = pass &&
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Row, Row>(
+                                                           BDataType,
-               true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
+                                                           CDataType,
+                                                           Col,
+                                                           Row,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Col,
+                                                                             Row,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
-    pass = pass &&
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Col, Row>(
+                                                           BDataType,
-               true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
+                                                           CDataType,
+                                                           Col,
+                                                           Col,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Col,
+                                                                             Col,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
    std::cout << "test BatchedGEMM fp16: " << (pass ? "Pass" : "Fail") << std::endl;
    return pass ? 0 : 1;

--- a/test/batched_gemm/batched_gemm_fp32.cpp
+++ b/test/batched_gemm/batched_gemm_fp32.cpp
@@ -5,6 +5,8 @@
 #include "profiler/profile_batched_gemm_impl.hpp"
+#include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
 namespace {
 using ADataType = float;
 using BDataType = float;
@@ -12,6 +14,8 @@ using CDataType = float;
 using Row = ck::tensor_layout::gemm::RowMajor;
 using Col = ck::tensor_layout::gemm::ColumnMajor;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 } // namespace
 int main()
@@ -23,21 +27,87 @@ int main()
    bool pass = true;
-    pass = pass &&
+    using namespace ck::tensor_operation::device;
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Row, Row>(
-               true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
+                                                           BDataType,
+                                                           CDataType,
+                                                           Row,
+                                                           Row,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Row,
+                                                                             Row,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
-    pass = pass &&
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Col, Row>(
+                                                           BDataType,
-               true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
+                                                           CDataType,
+                                                           Row,
+                                                           Col,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Row,
+                                                                             Col,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
-    pass = pass &&
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Row, Row>(
+                                                           BDataType,
-               true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
+                                                           CDataType,
+                                                           Col,
+                                                           Row,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Col,
+                                                                             Row,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
-    pass = pass &&
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Col, Row>(
+                                                           BDataType,
-               true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
+                                                           CDataType,
+                                                           Col,
+                                                           Col,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Col,
+                                                                             Col,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
    std::cout << "test BatchedGEMM fp32: " << (pass ? "Pass" : "Fail") << std::endl;
    return pass ? 0 : 1;

--- a/test/batched_gemm/batched_gemm_int8.cpp
+++ b/test/batched_gemm/batched_gemm_int8.cpp
@@ -5,6 +5,8 @@
 #include "profiler/profile_batched_gemm_impl.hpp"
+#include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
 namespace {
 using ADataType = int8_t;
 using BDataType = int8_t;
@@ -12,6 +14,8 @@ using CDataType = int8_t;
 using Row = ck::tensor_layout::gemm::RowMajor;
 using Col = ck::tensor_layout::gemm::ColumnMajor;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 } // namespace
 int main()
@@ -23,21 +27,87 @@ int main()
    bool pass = true;
-    pass = pass &&
+    using namespace ck::tensor_operation::device;
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Row, Row>(
-               true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
+                                                           BDataType,
+                                                           CDataType,
+                                                           Row,
+                                                           Row,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Row,
+                                                                             Row,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
-    pass = pass &&
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Col, Row>(
+                                                           BDataType,
-               true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
+                                                           CDataType,
+                                                           Row,
+                                                           Col,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Row,
+                                                                             Col,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
-    pass = pass &&
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Row, Row>(
+                                                           BDataType,
-               true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
+                                                           CDataType,
+                                                           Col,
+                                                           Row,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Col,
+                                                                             Row,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
-    pass = pass &&
+    pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
-           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Col, Row>(
+                                                           BDataType,
-               true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
+                                                           CDataType,
+                                                           Col,
+                                                           Col,
+                                                           Row,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           DeviceBatchedGemm<Col,
+                                                                             Col,
+                                                                             Row,
+                                                                             ADataType,
+                                                                             BDataType,
+                                                                             CDataType,
+                                                                             PassThrough,
+                                                                             PassThrough,
+                                                                             PassThrough>>(
+                       true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
    std::cout << "test BatchedGEMM int8: " << (pass ? "Pass" : "Fail") << std::endl;
    return pass ? 0 : 1;

--- a/test/batched_gemm_multi_d/CMakeLists.txt
+++ b/test/batched_gemm_multi_d/CMakeLists.txt
+# TODO: Enable for gfx90a after compiler fix
+if(NOT GPU_TARGETS MATCHES "gfx90a")
+    add_gtest_executable(test_batched_gemm_multi_d test_batched_gemm_multi_d.cpp)
+    target_link_libraries(test_batched_gemm_multi_d PRIVATE utility device_batched_gemm_multi_d_instance)
+endif()
--- a/test/batched_gemm_multi_d/test_batched_gemm_multi_d.cpp
+++ b/test/batched_gemm_multi_d/test_batched_gemm_multi_d.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <iostream>
+#include <tuple>
+#include <type_traits>
+#include <gtest/gtest.h>
+#include "profiler/profile_batched_gemm_impl.hpp"
+#include "ck/library/tensor_operation_instance/gpu/batched_gemm_multi_d.hpp"
+namespace {
+using F16 = ck::half_t;
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+using Empty_Tuple = ck::Tuple<>;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+// Typed fixture that drives ck::profiler::profile_batched_gemm_impl through the
+// DeviceBatchedGemmMultiD interface for one combination of matrix layouts.
+//
+// Tuple is expected to be a std::tuple of three layout tags, read in the order
+// (ALayout, BLayout, CLayout). Empty_Tuple is passed in the third and seventh
+// template slots of DeviceBatchedGemmMultiD.
+// NOTE(review): those slots are presumably the D-tensor layouts / data types, i.e.
+// this exercises the "no extra D tensors" case - confirm against the interface.
+template <typename Tuple>
+class TestBatchedGemmMultiD : public ::testing::Test
+{
+    protected:
+    using ALayout = std::tuple_element_t<0, Tuple>;
+    using BLayout = std::tuple_element_t<1, Tuple>;
+    using CLayout = std::tuple_element_t<2, Tuple>;
+    // Fixed problem size shared by every layout / data type combination.
+    static constexpr int M          = 512;
+    static constexpr int N          = 256;
+    static constexpr int K          = 128;
+    static constexpr int BatchCount = 3;
+    // The three arguments that follow M, N, K must depend on the layout. These are the
+    // values the plain batched_gemm tests pass for each layout combination:
+    //   A: K when row-major, M when column-major
+    //   B: N when row-major, K when column-major
+    //   C: N when row-major, M when column-major
+    // Previously K, N, N was passed unconditionally, which only matches Row/Row/Row.
+    // NOTE(review): presumably these are the leading-dimension strides of A, B and C -
+    // confirm against the parameter order of profile_batched_gemm_impl.
+    static constexpr int StrideA = std::is_same<ALayout, Row>::value ? K : M;
+    static constexpr int StrideB = std::is_same<BLayout, Row>::value ? N : K;
+    static constexpr int StrideC = std::is_same<CLayout, Row>::value ? N : M;
+    // Runs the profiler with DataType used for A, B and C and expects it to report success.
+    template <typename DataType>
+    void Run()
+    {
+        using namespace ck::tensor_operation::device;
+        // Leading arguments: do_verification = true, init_method = 1, do_log = false;
+        // the fourth argument (1) is forwarded unchanged from the original call.
+        const bool pass =
+            ck::profiler::profile_batched_gemm_impl<DataType,
+                                                    DataType,
+                                                    DataType,
+                                                    ALayout,
+                                                    BLayout,
+                                                    CLayout,
+                                                    PassThrough,
+                                                    PassThrough,
+                                                    PassThrough,
+                                                    DeviceBatchedGemmMultiD<ALayout,
+                                                                            BLayout,
+                                                                            Empty_Tuple,
+                                                                            CLayout,
+                                                                            DataType,
+                                                                            DataType,
+                                                                            Empty_Tuple,
+                                                                            DataType,
+                                                                            PassThrough,
+                                                                            PassThrough,
+                                                                            PassThrough>>(
+                true,
+                1,
+                false,
+                1,
+                M,
+                N,
+                K,
+                StrideA,
+                StrideB,
+                StrideC,
+                M * K,
+                K * N,
+                M * N,
+                BatchCount);
+        EXPECT_TRUE(pass);
+    }
+};
+using KernelTypes = ::testing::Types<std::tuple<Row, Row, Row>, // each tuple is (ALayout, BLayout, CLayout)
+                                     std::tuple<Row, Col, Row>,
+                                     std::tuple<Col, Row, Row>,
+                                     std::tuple<Col, Col, Row>>; // C is row-major in every listed combination
+} // namespace
+TYPED_TEST_SUITE(TestBatchedGemmMultiD, KernelTypes); // bind the fixture to the layout tuples in KernelTypes
+TYPED_TEST(TestBatchedGemmMultiD, f16) { this->template Run<F16>(); } // ck::half_t used for A, B and C
+TYPED_TEST(TestBatchedGemmMultiD, int8) { this->template Run<int8_t>(); } // int8_t used for A, B and C