Batched GEMM Multiple D based on Universal GEMM (#1655)

* Batched GEMM Multiple D based on Universal GEMM Co-authored-by: Jing Zhang <jizhan@fb.com> * CI fixes Co-authored-by: Jing Zhang <jizhan@fb.com> --------- Co-authored-by: Jing Zhang <jizhan@fb.com>

Batched GEMM Multiple D based on Universal GEMM (#1655)
* Batched GEMM Multiple D based on Universal GEMM Co-authored-by: Jing Zhang <jizhan@fb.com> * CI fixes Co-authored-by: Jing Zhang <jizhan@fb.com> --------- Co-authored-by: Jing Zhang <jizhan@fb.com>
754adc70 · Bartłomiej Kocot · GitHub · efb34741 · 754adc70 · 754adc70
Unverified Commit 754adc70 authored Nov 18, 2024 by Bartłomiej Kocot Committed by GitHub Nov 18, 2024
20 changed files
--- a/example/24_batched_gemm/CMakeLists.txt
+++ b/example/24_batched_gemm/CMakeLists.txt
@@ -9,6 +9,12 @@ add_example_dependencies(example_batched_gemm_xdl example_batched_gemm_xdl_fp16)
 add_example_executable(example_batched_gemm_xdl_bf16 batched_gemm_xdl_bf16.cpp)
 add_example_dependencies(example_batched_gemm_xdl example_batched_gemm_xdl_bf16)

+add_example_executable(example_batched_gemm_xdl_bf16_v3 batched_gemm_xdl_bf16_v3.cpp)
+add_example_dependencies(example_batched_gemm_xdl example_batched_gemm_xdl_bf16_v3)
+
+add_example_executable(example_batched_gemm_xdl_fp8_rowwise_v3 batched_gemm_xdl_fp8_rowwise_v3.cpp)
+add_example_dependencies(example_batched_gemm_xdl example_batched_gemm_xdl_fp8_rowwise_v3)
+
 add_example_executable(example_batched_gemm_xdl_int8 batched_gemm_xdl_int8.cpp)
 add_example_dependencies(example_batched_gemm_xdl example_batched_gemm_xdl_int8)


--- a/example/24_batched_gemm/batched_gemm_xdl_bf16_v3.cpp
+++ b/example/24_batched_gemm/batched_gemm_xdl_bf16_v3.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
+#include "ck/library/utility/literals.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using BF16 = ck::bhalf_t;
+using F32  = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using ADataType        = BF16;
+using BDataType        = BF16;
+using AccDataType      = F32;
+using CShuffleDataType = BF16;
+using DsDataType       = ck::Tuple<>;
+using EDataType        = BF16;
+
+using ALayout  = Row;
+using BLayout  = Col;
+using DsLayout = ck::Tuple<>;
+using ELayout  = Row;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<
+    ALayout,
+    BLayout,
+    DsLayout,
+    ELayout,
+    ADataType,
+    BDataType,
+    DsDataType,
+    EDataType,
+    AccDataType,
+    CShuffleDataType,
+    AElementOp,
+    BElementOp,
+    CDEElementOp,
+    GemmDefault,
+    256,            // BlockSize
+    256,            // MPerBlock
+    128,            // NPerBlock
+    32,             // KPerBlock
+    8,              // AK1
+    8,              // BK1
+    32,             // MPerXDL
+    32,             // NPerXDL
+    4,              // MXdlPerWave
+    2,              // NXdlPerWave
+    S<4, 64, 1>,    // ABlockTransferThreadClusterLengths_AK0_M_AK1
+    S<1, 0, 2>,     // ABlockTransferThreadClusterArrangeOrder
+    S<1, 0, 2>,     // ABlockTransferSrcAccessOrder
+    2,              // ABlockTransferSrcVectorDim
+    8,              // ABlockTransferSrcScalarPerVector
+    8,              // ABlockTransferDstScalarPerVector_AK1
+    1,              // ABlockLdsExtraM
+    S<4, 64, 1>,    // BBlockTransferThreadClusterLengths_BK0_N_BK1
+    S<1, 0, 2>,     // BBlockTransferThreadClusterArrangeOrder
+    S<1, 0, 2>,     // BBlockTransferSrcAccessOrder
+    2,              // BBlockTransferSrcVectorDim
+    8,              // BBlockTransferSrcScalarPerVector
+    8,              // BBlockTransferDstScalarPerVector_BK1
+    1,              // BBlockLdsExtraN
+    1,              // CShuffleMXdlPerWavePerShuffle
+    1,              // CShuffleNXdlPerWavePerShuffle
+    S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+    S<8>,           // CDEShuffleBlockTransferScalarPerVectors
+    ck::BlockGemmPipelineScheduler::Intrawave, // BlockGemmPipelineScheduler
+    ck::BlockGemmPipelineVersion::v3           // BlockGemmPipelineVersion
+    >;
+
+#include "run_batched_gemm_example.inc"
+
+int main(int argc, char* argv[]) { return !run_batched_gemm_example(argc, argv); }
--- a/example/24_batched_gemm/batched_gemm_xdl_fp8_rowwise_v3.cpp
+++ b/example/24_batched_gemm/batched_gemm_xdl_fp8_rowwise_v3.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
+#include "ck/library/utility/literals.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using F8   = ck::f8_t;
+using BF16 = ck::bhalf_t;
+using F32  = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough      = ck::tensor_operation::element_wise::PassThrough;
+using MultiplyMultiply = ck::tensor_operation::element_wise::MultiplyMultiply;
+
+using ADataType        = F8;
+using BDataType        = F8;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using D0DataType       = F32;
+using D1DataType       = F32;
+using DsDataType       = ck::Tuple<D0DataType, D1DataType>;
+using EDataType        = BF16;
+
+using ALayout  = Row;
+using BLayout  = Col;
+using D0Layout = Row;
+using D1Layout = Col;
+using DsLayout = ck::Tuple<D0Layout, D1Layout>;
+using ELayout  = Row;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = MultiplyMultiply;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<
+    ALayout,
+    BLayout,
+    DsLayout,
+    ELayout,
+    ADataType,
+    BDataType,
+    DsDataType,
+    EDataType,
+    AccDataType,
+    CShuffleDataType,
+    AElementOp,
+    BElementOp,
+    CDEElementOp,
+    GemmDefault,
+    256,            // BlockSize
+    256,            // MPerBlock
+    128,            // NPerBlock
+    32,             // KPerBlock
+    8,              // AK1
+    8,              // BK1
+    32,             // MPerXDL
+    32,             // NPerXDL
+    4,              // MXdlPerWave
+    2,              // NXdlPerWave
+    S<4, 64, 1>,    // ABlockTransferThreadClusterLengths_AK0_M_AK1
+    S<1, 0, 2>,     // ABlockTransferThreadClusterArrangeOrder
+    S<1, 0, 2>,     // ABlockTransferSrcAccessOrder
+    2,              // ABlockTransferSrcVectorDim
+    8,              // ABlockTransferSrcScalarPerVector
+    8,              // ABlockTransferDstScalarPerVector_AK1
+    1,              // ABlockLdsExtraM
+    S<4, 64, 1>,    // BBlockTransferThreadClusterLengths_BK0_N_BK1
+    S<1, 0, 2>,     // BBlockTransferThreadClusterArrangeOrder
+    S<1, 0, 2>,     // BBlockTransferSrcAccessOrder
+    2,              // BBlockTransferSrcVectorDim
+    8,              // BBlockTransferSrcScalarPerVector
+    8,              // BBlockTransferDstScalarPerVector_BK1
+    1,              // BBlockLdsExtraN
+    1,              // CShuffleMXdlPerWavePerShuffle
+    1,              // CShuffleNXdlPerWavePerShuffle
+    S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+    S<8, 8, 1>,     // CDEShuffleBlockTransferScalarPerVectors
+    ck::BlockGemmPipelineScheduler::Interwave, // BlockGemmPipelineScheduler
+    ck::BlockGemmPipelineVersion::v1,          // BlockGemmPipelineVersion
+    F8                                         // ComputeTypeA
+    >;
+
+#include "run_batched_gemm_example_rowwise.inc"
+
+int main(int argc, char* argv[]) { return !run_batched_gemm_rowwise_example(argc, argv); }
--- a/example/24_batched_gemm/run_batched_gemm_example.inc
+++ b/example/24_batched_gemm/run_batched_gemm_example.inc
@@ -210,17 +210,9 @@ bool run_batched_gemm_example(int argc, char* argv[])

    problem_size.M = 256 * (dis(gen) + 1);
    problem_size.N = 128 * (dis(gen) + 1);
-    problem_size.K = 64 * (dis(gen) + 2);
+    problem_size.K = 128 * (dis(gen) + 2);

-    problem_size.stride_A = problem_size.K;
-    problem_size.stride_B = problem_size.K;
-    problem_size.stride_C = problem_size.N;
-
-    problem_size.batch_stride_A = problem_size.M * problem_size.K;
-    problem_size.batch_stride_B = problem_size.K * problem_size.N;
-    problem_size.batch_stride_C = problem_size.M * problem_size.N;
-
-    problem_size.batch_count = 16;
+    problem_size.batch_count = 2;

    if(argc == 4)
    {
@@ -228,13 +220,37 @@ bool run_batched_gemm_example(int argc, char* argv[])
        config.init_method     = std::stoi(argv[2]);
        config.time_kernel     = std::stoi(argv[3]);
    }
+    else if(argc == 8)
+    {
+        config.do_verification   = std::stoi(argv[1]);
+        config.init_method       = std::stoi(argv[2]);
+        config.time_kernel       = std::stoi(argv[3]);
+        problem_size.M           = std::stoi(argv[4]);
+        problem_size.N           = std::stoi(argv[5]);
+        problem_size.K           = std::stoi(argv[6]);
+        problem_size.batch_count = std::stoi(argv[7]);
+    }
    else
    {
        printf("arg1: verification (0=no, 1=yes)\n");
        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
        printf("arg3: time kernel (0=n0, 1=yes)\n");
+        printf("optinal\n");
+        printf("arg4-7: M = %d N = %d K = %d Batch = %d\n",
+               problem_size.M,
+               problem_size.N,
+               problem_size.K,
+               problem_size.batch_count);
        exit(0);
    }

+    problem_size.stride_A = problem_size.K;
+    problem_size.stride_B = problem_size.K;
+    problem_size.stride_C = problem_size.N;
+
+    problem_size.batch_stride_A = problem_size.M * problem_size.K;
+    problem_size.batch_stride_B = problem_size.K * problem_size.N;
+    problem_size.batch_stride_C = problem_size.M * problem_size.N;
+
    return run_batched_gemm(problem_size, config);
 }
--- a/example/24_batched_gemm/run_batched_gemm_example_rowwise.inc
+++ b/example/24_batched_gemm/run_batched_gemm_example_rowwise.inc
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#include <random>
+
+#pragma once
+
+struct ProblemSize final
+{
+    ck::index_t M = 3840;
+    ck::index_t N = 4096;
+    ck::index_t K = 4096;
+
+    ck::index_t stride_A = K;
+    ck::index_t stride_B = K;
+    ck::index_t stride_C = N;
+
+    ck::index_t stride_D0 = 0;
+    ck::index_t stride_D1 = 0;
+
+    ck::index_t batch_stride_A = M * K;
+    ck::index_t batch_stride_B = K * N;
+    ck::index_t batch_stride_C = M * N;
+
+    ck::index_t batch_stride_D0 = N;
+    ck::index_t batch_stride_D1 = M;
+
+    ck::index_t batch_count = 16;
+};
+
+struct ExecutionConfig final
+{
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
+};
+
+bool run_batched_gemm_rowwise(const ProblemSize& problem_size, const ExecutionConfig& config)
+{
+    using namespace ck::literals;
+
+    auto& [M,
+           N,
+           K,
+           stride_A,
+           stride_B,
+           stride_C,
+           stride_D0,
+           stride_D1,
+           batch_stride_A,
+           batch_stride_B,
+           batch_stride_C,
+           batch_stride_D0,
+           batch_stride_D1,
+           batch_count] = problem_size;
+
+    // GEMM shape
+    auto f_host_tensor_descriptor = [](std::size_t batch_count_,
+                                       std::size_t row,
+                                       std::size_t col,
+                                       std::size_t stride,
+                                       std::size_t batch_stride,
+                                       auto layout) {
+        using namespace ck::literals;
+
+        if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+        {
+            return HostTensorDescriptor({batch_count_, row, col}, {batch_stride, stride, 1_uz});
+        }
+        else
+        {
+            return HostTensorDescriptor({batch_count_, row, col}, {batch_stride, 1_uz, stride});
+        }
+    };
+
+    Tensor<ADataType> a_g_m_k(
+        f_host_tensor_descriptor(batch_count, M, K, stride_A, batch_stride_A, ALayout{}));
+    Tensor<BDataType> b_g_k_n(
+        f_host_tensor_descriptor(batch_count, K, N, stride_B, batch_stride_B, BLayout{}));
+    Tensor<D0DataType> d0_g_m_n(
+        f_host_tensor_descriptor(batch_count, M, N, stride_D0, batch_stride_D0, D0Layout{}));
+    Tensor<D1DataType> d1_g_m_n(
+        f_host_tensor_descriptor(batch_count, M, N, stride_D1, batch_stride_D1, D1Layout{}));
+    Tensor<EDataType> e_g_m_n_device_result(
+        f_host_tensor_descriptor(batch_count, M, N, stride_C, batch_stride_C, ELayout{}));
+
+    std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl;
+    std::cout << "b_g_k_n: " << b_g_k_n.mDesc << std::endl;
+    std::cout << "d0_g_m_n: " << d0_g_m_n.mDesc << std::endl;
+    std::cout << "d1_g_m_n: " << d1_g_m_n.mDesc << std::endl;
+    std::cout << "e_g_m_n: " << e_g_m_n_device_result.mDesc << std::endl;
+
+    switch(config.init_method)
+    {
+    case 0: break;
+    case 1:
+        a_g_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+        b_g_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        break;
+    default:
+        a_g_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b_g_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        break;
+    }
+
+    d0_g_m_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{0.0, 1.0});
+    d1_g_m_n.GenerateTensorValue(GeneratorTensor_3<D1DataType>{0.0, 1.0});
+
+    DeviceMem a_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_g_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem d0_device_buf(sizeof(D0DataType) * d0_g_m_n.mDesc.GetElementSpaceSize());
+    DeviceMem d1_device_buf(sizeof(D1DataType) * d1_g_m_n.mDesc.GetElementSpaceSize());
+    DeviceMem c_device_buf(sizeof(EDataType) * e_g_m_n_device_result.mDesc.GetElementSpaceSize());
+
+    a_device_buf.ToDevice(a_g_m_k.mData.data());
+    b_device_buf.ToDevice(b_g_k_n.mData.data());
+
+    d0_device_buf.ToDevice(d0_g_m_n.mData.data());
+    d1_device_buf.ToDevice(d1_g_m_n.mData.data());
+
+    auto a_element_op   = AElementOp{};
+    auto b_element_op   = BElementOp{};
+    auto cde_element_op = CDEElementOp{};
+
+    auto gemm    = DeviceGemmInstance{};
+    auto invoker = gemm.MakeInvoker();
+
+    // do GEMM
+    auto argument =
+        gemm.MakeArgument(a_device_buf.GetDeviceBuffer(),
+                          b_device_buf.GetDeviceBuffer(),
+                          {d0_device_buf.GetDeviceBuffer(), d1_device_buf.GetDeviceBuffer()},
+                          c_device_buf.GetDeviceBuffer(),
+                          M,
+                          N,
+                          K,
+                          batch_count,
+                          stride_A,
+                          stride_B,
+                          {stride_D0, stride_D1},
+                          stride_C,
+                          batch_stride_A,
+                          batch_stride_B,
+                          {batch_stride_D0, batch_stride_D1},
+                          batch_stride_C,
+                          a_element_op,
+                          b_element_op,
+                          cde_element_op);
+
+    if(!gemm.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_gemm with the specified compilation parameters does "
+            "not support this GEMM problem");
+    }
+
+    invoker.Run(argument, StreamConfig{nullptr, false});
+    bool pass = true;
+
+    if(config.do_verification)
+    {
+        c_device_buf.FromDevice(e_g_m_n_device_result.mData.data());
+
+        Tensor<CShuffleDataType> c_g_m_n({batch_count, M, N});
+
+        using ReferenceBatchedGemmInstance =
+            ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
+                                                             BDataType,
+                                                             CShuffleDataType,
+                                                             AccDataType,
+                                                             AElementOp,
+                                                             BElementOp,
+                                                             PassThrough>;
+
+        auto ref_batched_gemm = ReferenceBatchedGemmInstance{};
+        auto ref_invoker      = ref_batched_gemm.MakeInvoker();
+
+        Tensor<EDataType> e_g_m_n_host_result(
+            f_host_tensor_descriptor(batch_count, M, N, stride_C, batch_stride_C, ELayout{}));
+
+        auto ref_argument = ref_batched_gemm.MakeArgument(
+            a_g_m_k, b_g_k_n, c_g_m_n, a_element_op, b_element_op, PassThrough{});
+
+        ref_invoker.Run(ref_argument);
+
+        for(int b = 0; b < batch_count; ++b)
+        {
+            for(int m = 0; m < M; ++m)
+            {
+                for(int n = 0; n < N; ++n)
+                {
+                    cde_element_op(e_g_m_n_host_result(b, m, n),
+                                   c_g_m_n(b, m, n),
+                                   d0_g_m_n(b, m, n),
+                                   d1_g_m_n(b, m, n));
+                }
+            }
+        }
+
+        pass = ck::utils::check_err(
+            e_g_m_n_device_result, e_g_m_n_host_result, "Error: Incorrect results c");
+    }
+
+    if(config.time_kernel)
+    {
+        float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
+
+        std::size_t flop      = std::size_t(2) * batch_count * M * N * K;
+        std::size_t num_btype = sizeof(ADataType) * batch_count * M * K +
+                                sizeof(BDataType) * batch_count * K * N +
+                                sizeof(EDataType) * batch_count * M * N;
+
+        float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
+        float gb_per_sec = num_btype / 1.E6 / ave_time;
+        std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
+                  << " GB/s, " << gemm.GetTypeString() << std::endl;
+    }
+
+    return pass ? 0 : 1;
+}
+
+bool run_batched_gemm_rowwise_example(int argc, char* argv[])
+{
+    ProblemSize problem_size;
+    ExecutionConfig config;
+
+    std::mt19937 gen(11939);
+    std::uniform_int_distribution<int> dis(0, 15);
+
+    problem_size.M = 256 * (dis(gen) + 1);
+    problem_size.N = 128 * (dis(gen) + 1);
+    problem_size.K = 128 * (dis(gen) + 2);
+
+    problem_size.batch_count = 2;
+
+    if(argc == 4)
+    {
+        config.do_verification = std::stoi(argv[1]);
+        config.init_method     = std::stoi(argv[2]);
+        config.time_kernel     = std::stoi(argv[3]);
+    }
+    else if(argc == 8)
+    {
+        config.do_verification   = std::stoi(argv[1]);
+        config.init_method       = std::stoi(argv[2]);
+        config.time_kernel       = std::stoi(argv[3]);
+        problem_size.M           = std::stoi(argv[4]);
+        problem_size.N           = std::stoi(argv[5]);
+        problem_size.K           = std::stoi(argv[6]);
+        problem_size.batch_count = std::stoi(argv[7]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=n0, 1=yes)\n");
+        printf("optinal\n");
+        printf("arg4-7: M = %d N = %d K = %d Batch = %d\n",
+               problem_size.M,
+               problem_size.N,
+               problem_size.K,
+               problem_size.batch_count);
+        exit(0);
+    }
+
+    problem_size.stride_A = problem_size.K;
+    problem_size.stride_B = problem_size.K;
+    problem_size.stride_C = problem_size.N;
+
+    problem_size.stride_D0 = 0;
+    problem_size.stride_D1 = 0;
+
+    problem_size.batch_stride_A = problem_size.M * problem_size.K;
+    problem_size.batch_stride_B = problem_size.K * problem_size.N;
+    problem_size.batch_stride_C = problem_size.M * problem_size.N;
+
+    problem_size.batch_stride_D0 = problem_size.N;
+    problem_size.batch_stride_D1 = problem_size.M;
+
+    return run_batched_gemm_rowwise(problem_size, config);
+}
--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -53,6 +53,47 @@ struct DeviceBatchedGemmMultiD : public BaseOperator
    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
 };

+template <typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CDEElementwiseOperation>
+struct DeviceBatchedGemmV2MultiD : public BaseOperator
+{
+    static constexpr index_t NumDTensor = DsDataType::Size();
+
+    static_assert(DsLayout::Size() == DsDataType::Size(), "wrong! inconsisiten NumDTensor");
+
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_a,
+                        const void* p_b,
+                        const std::array<const void*, NumDTensor>& p_ds,
+                        void* p_e,
+                        index_t M,
+                        index_t N,
+                        index_t K,
+                        index_t Batch,
+                        index_t StrideA,
+                        index_t StrideB,
+                        const std::array<ck::index_t, NumDTensor>& StrideDs,
+                        index_t StrideE,
+                        index_t BatchStrideA,
+                        index_t BatchStrideB,
+                        const std::array<ck::index_t, NumDTensor>& BatchStrideDs,
+                        index_t BatchStrideE,
+                        AElementwiseOperation a_element_op,
+                        BElementwiseOperation b_element_op,
+                        CDEElementwiseOperation cde_element_op) = 0;
+
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+};
+
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_batched.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_batched.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <vector>
+#include <memory>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+#ifdef CK_ENABLE_BF16
+void add_device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmV2MultiD<Row,
+                                                          Col,
+                                                          Empty_Tuple,
+                                                          Row,
+                                                          BF16,
+                                                          BF16,
+                                                          Empty_Tuple,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          PassThrough>>>& instances);
+
+void add_device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmV2MultiD<Row,
+                                                          Col,
+                                                          Empty_Tuple,
+                                                          Row,
+                                                          BF16,
+                                                          BF16,
+                                                          Empty_Tuple,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          PassThrough>>>& instances);
+
+void add_device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmV2MultiD<Row,
+                                                          Col,
+                                                          Empty_Tuple,
+                                                          Row,
+                                                          BF16,
+                                                          BF16,
+                                                          Empty_Tuple,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          PassThrough>>>& instances);
+
+#endif
+
+#ifdef CK_ENABLE_FP8
+void add_device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_default_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmV2MultiD<Row,
+                                                          Col,
+                                                          Empty_Tuple,
+                                                          Row,
+                                                          F8,
+                                                          F8,
+                                                          Empty_Tuple,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          PassThrough>>>& instances);
+
+void add_device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmV2MultiD<Row,
+                                                          Col,
+                                                          Empty_Tuple,
+                                                          Row,
+                                                          F8,
+                                                          F8,
+                                                          Empty_Tuple,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          PassThrough>>>& instances);
+
+void add_device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmV2MultiD<Row,
+                                                          Col,
+                                                          Empty_Tuple,
+                                                          Row,
+                                                          F8,
+                                                          F8,
+                                                          Empty_Tuple,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          PassThrough>>>& instances);
+
+#endif
+
+template <typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename CDataType,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename CLayout>
+struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceBatchedGemmV2MultiD<
+    ALayout,
+    BLayout,
+    DsLayout,
+    CLayout,
+    ADataType,
+    BDataType,
+    DsDataType,
+    CDataType,
+    ck::tensor_operation::element_wise::PassThrough,
+    ck::tensor_operation::element_wise::PassThrough,
+    ck::tensor_operation::element_wise::PassThrough>>
+{
+    using DeviceOp = DeviceBatchedGemmV2MultiD<ALayout,
+                                               BLayout,
+                                               DsLayout,
+                                               CLayout,
+                                               ADataType,
+                                               BDataType,
+                                               DsDataType,
+                                               CDataType,
+                                               ck::tensor_operation::element_wise::PassThrough,
+                                               ck::tensor_operation::element_wise::PassThrough,
+                                               ck::tensor_operation::element_wise::PassThrough>;
+
+    static auto GetInstances()
+    {
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+
+#ifdef CK_ENABLE_BF16
+        if constexpr(is_same_v<ADataType, bhalf_t> && is_same_v<BDataType, bhalf_t> &&
+                     is_same_v<CDataType, bhalf_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                         is_same_v<CLayout, Row>)
+            {
+                add_device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instances(
+                    op_ptrs);
+
+                add_device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instances(
+                    op_ptrs);
+
+                add_device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instances(
+                    op_ptrs);
+            }
+        }
+#endif
+
+#ifdef CK_ENABLE_FP8
+        if constexpr(is_same_v<ADataType, f8_t> && is_same_v<BDataType, f8_t> &&
+                     is_same_v<CDataType, bhalf_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                         is_same_v<CLayout, Row>)
+            {
+                add_device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_default_instances(
+                    op_ptrs);
+
+                add_device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v1_default_instances(
+                    op_ptrs);
+
+                add_device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v2_default_instances(
+                    op_ptrs);
+            }
+        }
+#endif
+        return op_ptrs;
+    }
+};
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
@@ -81,6 +81,12 @@ function(add_instance_library INSTANCE_NAME)
         list(REMOVE_ITEM ARGN "${source}")
    endif()
    endforeach()
+    foreach(source IN LISTS ARGN)
+    if(NOT INST_TARGETS MATCHES "gfx94" AND source MATCHES "batched_gemm_xdl_universal" AND source MATCHES "_f8_")
+         message("removing batched_gemm_universal_f8 instance ${source} ")
+         list(REMOVE_ITEM ARGN "${source}")
+    endif()
+    endforeach()
    endif()
    #only continue if there are some source files left on the list
    if(ARGN)
@@ -102,6 +108,9 @@ function(add_instance_library INSTANCE_NAME)
                if(source MATCHES "gemm_multiply_multiply_f8")
                    list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
                endif()
+                if(source MATCHES "bached_gemm_multiply_multiply_f8")
+                    list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
+                endif()
            endif()
            set(offload_targets)
            foreach(target IN LISTS INST_TARGETS)

--- a/library/src/tensor_operation_instance/gpu/gemm_universal_batched/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/CMakeLists.txt
+# ONLY XDL_KERNELS
+set(GEMM_UNIVERSAL_BATCHED_INSTANCES)
+
+list(APPEND GEMM_UNIVERSAL_BATCHED_INSTANCES 
+        device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp
+        device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instance.cpp
+        device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instance.cpp
+
+        device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp
+        device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v1_default_instance.cpp
+        device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v2_default_instance.cpp
+    )
+
+
+set_source_files_properties(device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+
+
+add_instance_library(device_gemm_universal_batched_instance ${GEMM_UNIVERSAL_BATCHED_INSTANCES})
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_default_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmV2MultiD<Row,
+                                                          Col,
+                                                          ck::Tuple<>,
+                                                          Row,
+                                                          BF16,
+                                                          BF16,
+                                                          ck::Tuple<>,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_instances<GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmV2MultiD<Row,
+                                                          Col,
+                                                          ck::Tuple<>,
+                                                          Row,
+                                                          BF16,
+                                                          BF16,
+                                                          ck::Tuple<>,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_instances<Intrawave,
+                                                                                GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmV2MultiD<Row,
+                                                          Col,
+                                                          ck::Tuple<>,
+                                                          Row,
+                                                          BF16,
+                                                          BF16,
+                                                          ck::Tuple<>,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_instances<Interwave,
+                                                                                GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_default_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_default_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmV2MultiD<Row,
+                                                          Col,
+                                                          ck::Tuple<>,
+                                                          Row,
+                                                          F8,
+                                                          F8,
+                                                          ck::Tuple<>,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_instances<GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v1_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v1_default_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmV2MultiD<Row,
+                                                          Col,
+                                                          ck::Tuple<>,
+                                                          Row,
+                                                          F8,
+                                                          F8,
+                                                          ck::Tuple<>,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_instances<Intrawave,
+                                                                            GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v2_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v2_default_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmV2MultiD<Row,
+                                                          Col,
+                                                          ck::Tuple<>,
+                                                          Row,
+                                                          F8,
+                                                          F8,
+                                                          ck::Tuple<>,
+                                                          BF16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_instances<Interwave,
+                                                                            GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/profiler/include/profiler/profile_gemm_universal_batched_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_universal_batched_impl.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <memory>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
+#include "ck/library/tensor_operation_instance/gpu/batched_gemm_multi_d.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
+
+namespace ck {
+namespace profiler {
+
+template <typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename AElementOp,
+          typename BElementOp,
+          typename CElementOp,
+          typename DeviceOp>
+bool profile_gemm_universal_batched_impl(int do_verification,
+                                         int init_method,
+                                         bool do_log,
+                                         bool time_kernel,
+                                         int M,
+                                         int N,
+                                         int K,
+                                         int BatchStrideA,
+                                         int BatchStrideB,
+                                         int BatchStrideC,
+                                         int StrideA,
+                                         int StrideB,
+                                         int StrideC,
+                                         int BatchCount,
+                                         int n_warmup,
+                                         int n_iter,
+                                         uint64_t rotating = 0)
+{
+    bool pass = true;
+
+    auto f_host_tensor_descriptor = [](std::size_t batch_count,
+                                       std::size_t row,
+                                       std::size_t col,
+                                       std::size_t stride,
+                                       std::size_t batch_stride,
+                                       auto layout) {
+        using namespace ck::literals;
+
+        if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
+        {
+            return HostTensorDescriptor({batch_count, row, col}, {batch_stride, stride, 1_uz});
+        }
+        else
+        {
+            return HostTensorDescriptor({batch_count, row, col}, {batch_stride, 1_uz, stride});
+        }
+    };
+
+    Tensor<ADataType> a_g_m_k(
+        f_host_tensor_descriptor(BatchCount, M, K, StrideA, BatchStrideA, ALayout{}));
+    Tensor<BDataType> b_g_k_n(
+        f_host_tensor_descriptor(BatchCount, K, N, StrideB, BatchStrideB, BLayout{}));
+    Tensor<CDataType> c_g_m_n_host_result(
+        f_host_tensor_descriptor(BatchCount, M, N, StrideC, BatchStrideC, CLayout{}));
+    Tensor<CDataType> c_g_m_n_device_result(
+        f_host_tensor_descriptor(BatchCount, M, N, StrideC, BatchStrideC, CLayout{}));
+
+    int total_gemm_needed =
+        a_g_m_k.GetElementSpaceSizeInBytes() + b_g_k_n.GetElementSpaceSizeInBytes();
+    int rotating_count = std::max(
+        1,
+        std::min(n_iter,
+                 static_cast<int>(std::ceil(static_cast<double>(rotating) / total_gemm_needed))));
+
+    std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl;
+    std::cout << "b_g_k_n: " << b_g_k_n.mDesc << std::endl;
+    std::cout << "c_g_m_n: " << c_g_m_n_host_result.mDesc << std::endl;
+    std::cout << "rotating count: " << rotating_count << std::endl;
+
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        a_g_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+        b_g_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        break;
+    default:
+        a_g_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b_g_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+    }
+
+    const auto a_element_op = AElementOp{};
+    const auto b_element_op = BElementOp{};
+    const auto c_element_op = CElementOp{};
+
+    if(do_verification)
+    {
+        using ReferenceBatchedGemmInstance =
+            ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
+                                                             BDataType,
+                                                             CDataType,
+                                                             float,
+                                                             AElementOp,
+                                                             BElementOp,
+                                                             CElementOp>;
+
+        auto ref_batched_gemm = ReferenceBatchedGemmInstance{};
+        auto ref_invoker      = ref_batched_gemm.MakeInvoker();
+
+        auto ref_argument = ref_batched_gemm.MakeArgument(
+            a_g_m_k, b_g_k_n, c_g_m_n_host_result, a_element_op, b_element_op, c_element_op);
+
+        ref_invoker.Run(ref_argument);
+    }
+
+    DeviceMem a_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_g_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem c_device_buf(sizeof(CDataType) * c_g_m_n_device_result.mDesc.GetElementSpaceSize());
+
+    a_device_buf.ToDevice(a_g_m_k.mData.data());
+    b_device_buf.ToDevice(b_g_k_n.mData.data());
+    c_device_buf.ToDevice(c_g_m_n_device_result.mData.data());
+
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    std::string best_op_name;
+    float best_ave_time   = 0;
+    float best_tflops     = 0;
+    float best_gb_per_sec = 0;
+
+    // profile device op instances
+    for(auto& op_ptr : op_ptrs)
+    {
+        std::unique_ptr<tensor_operation::device::BaseArgument> argument_ptr;
+        // false branch for multi d dl kernel
+
+        argument_ptr =
+            op_ptr->MakeArgumentPointer(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
+                                        static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
+                                        {},
+                                        static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
+                                        M,
+                                        N,
+                                        K,
+                                        BatchCount,
+                                        StrideA,
+                                        StrideB,
+                                        {},
+                                        StrideC,
+                                        BatchStrideA,
+                                        BatchStrideB,
+                                        {},
+                                        BatchStrideC,
+                                        ck::tensor_operation::element_wise::PassThrough{},
+                                        ck::tensor_operation::element_wise::PassThrough{},
+                                        ck::tensor_operation::element_wise::PassThrough{});
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            // re-init C to zero before profiling next kernel
+            c_device_buf.SetZero();
+
+            std::string op_name = op_ptr->GetTypeString();
+
+            float ave_time = invoker_ptr->Run(
+                argument_ptr.get(),
+                StreamConfig{nullptr, time_kernel, 0, n_warmup, n_iter, true, rotating_count});
+
+            std::size_t flop = std::size_t(2) * BatchCount * M * N * K;
+
+            std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
+                                     sizeof(CDataType) * M * N) *
+                                    BatchCount;
+
+            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+            float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+            std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
+                      << " GB/s, " << op_name << std::endl;
+
+            if(tflops > best_tflops)
+            {
+                best_op_name    = op_name;
+                best_tflops     = tflops;
+                best_ave_time   = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+
+            if(do_verification)
+            {
+                c_device_buf.FromDevice(c_g_m_n_device_result.mData.data());
+
+                pass = pass & ck::utils::check_err(c_g_m_n_device_result, c_g_m_n_host_result);
+
+                if(do_log)
+                {
+                    LogRangeAsType<float>(std::cout << "a : ", a_g_m_k.mData, ",") << std::endl;
+                    LogRangeAsType<float>(std::cout << "b: ", b_g_k_n.mData, ",") << std::endl;
+                    LogRangeAsType<float>(std::cout << "c_host: ", c_g_m_n_host_result.mData, ",")
+                        << std::endl;
+                    LogRangeAsType<float>(
+                        std::cout << "c_device: ", c_g_m_n_device_result.mData, ",")
+                        << std::endl;
+                }
+            }
+        }
+        else
+        {
+            std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl;
+        }
+    }
+
+    if constexpr(is_same<CDataType, float>::value)
+    {
+        std::cout << "Best Perf for datatype = f32";
+    }
+    else if constexpr(is_same<CDataType, half_t>::value)
+    {
+        std::cout << "Best Perf for datatype = f16";
+    }
+    else if constexpr(is_same<CDataType, bhalf_t>::value)
+    {
+        std::cout << "Best Perf for datatype = bf16";
+    }
+    else if constexpr(is_same<CDataType, int8_t>::value)
+    {
+        std::cout << "Best Perf for datatype = int8";
+    }
+
+    if constexpr(is_same<ALayout, tensor_layout::gemm::RowMajor>::value)
+    {
+        std::cout << " ALayout =  RowMajor";
+    }
+    else if constexpr(is_same<ALayout, tensor_layout::gemm::ColumnMajor>::value)
+    {
+        std::cout << " ALayout =  ColumnMajor";
+    }
+
+    if constexpr(is_same<BLayout, tensor_layout::gemm::RowMajor>::value)
+    {
+        std::cout << " BLayout =  RowMajor";
+    }
+    else if constexpr(is_same<BLayout, tensor_layout::gemm::ColumnMajor>::value)
+    {
+        std::cout << " BLayout =  ColumnMajor";
+    }
+
+    std::cout << " B = " << BatchCount << " M = " << M << " N = " << N << " K = " << K
+              << " StrideA = " << StrideA << " StrideB = " << StrideB << " StrideC = " << StrideC
+              << ": " << best_ave_time << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec
+              << " GB/s, " << best_op_name << std::endl;
+
+    return pass;
+}
+
+} // namespace profiler
+} // namespace ck
--- a/profiler/src/CMakeLists.txt
+++ b/profiler/src/CMakeLists.txt
@@ -59,6 +59,7 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
  list(APPEND PROFILER_SOURCES profile_gemm_bias_add_reduce.cpp)
  list(APPEND PROFILER_SOURCES profile_gemm_splitk.cpp)
  list(APPEND PROFILER_SOURCES profile_gemm_universal.cpp)
+  list(APPEND PROFILER_SOURCES profile_gemm_universal_batched.cpp)
  list(APPEND PROFILER_SOURCES profile_gemm_universal_reduce.cpp)
  list(APPEND PROFILER_SOURCES profile_gemm_universal_streamk.cpp)
  list(APPEND PROFILER_SOURCES profile_conv_fwd_bias_relu.cpp)
@@ -141,6 +142,7 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
  endif()
  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_splitk_instance)
  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_instance)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_batched_instance)
  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_reduce_instance)
  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_streamk_instance)
  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_multiply_instance)