Merge branch 'amd-develop' into amd-master

241c261f · Jun Liu · 1762f081 · e2eb0418 · 241c261f · 241c261f
Commit 241c261f authored Aug 21, 2024 by Jun Liu
20 changed files
--- a/profiler/include/profiler/profile_gemm_universal_reduce_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_universal_reduce_impl.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <iomanip>
+#include <iostream>
+#include <string>
+#include <typeinfo>
+#include <vector>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3r1.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/gemm_universal_reduce.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+
+namespace ck {
+namespace profiler {
+
+// Profiles every registered DeviceGemmV2R1 instance for one GEMM problem, optionally checks
+// each result against a host reference GEMM, and prints the fastest (instance, KBatch) pair.
+//
+// Template parameters select the element types (A, B, Ds, accumulator, C) and the layouts
+// (A, B, Ds, C) used to look up device instances and to build the host tensors.
+//
+// Parameters:
+//   do_verification   non-zero: run the host reference once and compare every supported
+//                     device run against it
+//   init_method       0: leave A/B as constructed; 1: GeneratorTensor_2{-1, 2};
+//                     any other value: GeneratorTensor_3 (A: {0.0, 1.0}, B: {-0.5, 0.5})
+//   do_log            when verifying, also dump A, B, host C and device C
+//   time_kernel       forwarded to StreamConfig for the measured run
+//   M, N, K           GEMM problem size
+//   StrideA/B/C       strides used for the host descriptors and passed to the device op
+//   KBatch            > 0: profile only this split factor; otherwise sweep a built-in list
+//   n_warmup, n_iter  forwarded to StreamConfig
+//   rotating          rotating-buffer budget, in the same unit as
+//                     GetElementSpaceSizeInBytes(); 0 keeps the rotating count at 1
+//
+// Returns true when every verification that ran succeeded (trivially true when
+// do_verification is 0).
+template <typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename AccDataType,
+          typename CDataType,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename CLayout>
+bool profile_gemm_universal_reduce_impl(int do_verification,
+                                        int init_method,
+                                        bool do_log,
+                                        bool time_kernel,
+                                        int M,
+                                        int N,
+                                        int K,
+                                        int StrideA,
+                                        int StrideB,
+                                        int StrideC,
+                                        int KBatch,
+                                        int n_warmup,
+                                        int n_iter,
+                                        uint64_t rotating = 0)
+{
+    bool pass = true;
+
+    // Builds a 2-D host descriptor: {stride, 1} strides for RowMajor, {1, stride} otherwise.
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
+
+            if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
+            {
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+            }
+            else
+            {
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
+            }
+        };
+
+    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
+    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+    Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+    Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+
+    // Bytes of A plus B. Kept in std::size_t: an int would truncate for large problems.
+    const std::size_t total_gemm_needed =
+        a_m_k.GetElementSpaceSizeInBytes() + b_k_n.GetElementSpaceSizeInBytes();
+
+    // rotating_count = clamp(ceil(rotating / total_gemm_needed), 1, n_iter).
+    // The upper clamp is applied while still in double so the int conversion cannot overflow,
+    // and an empty problem (0 bytes) is skipped instead of dividing by zero.
+    int rotating_count = 1;
+    if(total_gemm_needed > 0)
+    {
+        const double buffers_needed =
+            std::ceil(static_cast<double>(rotating) / static_cast<double>(total_gemm_needed));
+
+        rotating_count = std::max(
+            1, static_cast<int>(std::min(static_cast<double>(n_iter), buffers_needed)));
+    }
+
+    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
+    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
+    std::cout << "c_m_n: " << c_m_n_device_result.mDesc << std::endl;
+    std::cout << "rotating count: " << rotating_count << std::endl;
+
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-1, 2});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-1, 2});
+        break;
+    default:
+        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+    }
+
+    using AElementOp = ck::tensor_operation::element_wise::PassThrough;
+    using BElementOp = ck::tensor_operation::element_wise::PassThrough;
+    using CElementOp = ck::tensor_operation::element_wise::PassThrough;
+
+    const auto a_element_op = AElementOp{};
+    const auto b_element_op = BElementOp{};
+    const auto c_element_op = CElementOp{};
+
+    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
+
+    a_device_buf.ToDevice(a_m_k.mData.data());
+    b_device_buf.ToDevice(b_k_n.mData.data());
+
+    using DeviceOp = ck::tensor_operation::device::DeviceGemmV2R1<ALayout,
+                                                                  BLayout,
+                                                                  DsLayout,
+                                                                  CLayout,
+                                                                  ADataType,
+                                                                  BDataType,
+                                                                  DsDataType,
+                                                                  CDataType,
+                                                                  AElementOp,
+                                                                  BElementOp,
+                                                                  CElementOp>;
+
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    // Run reference GEMM once; every device run below is compared against this result.
+    if(do_verification)
+    {
+        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
+                                                                                BDataType,
+                                                                                CDataType,
+                                                                                AccDataType,
+                                                                                AElementOp,
+                                                                                BElementOp,
+                                                                                CElementOp>;
+
+        auto ref_gemm    = ReferenceGemmInstance{};
+        auto ref_invoker = ref_gemm.MakeInvoker();
+
+        auto ref_argument = ref_gemm.MakeArgument(
+            a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
+
+        ref_invoker.Run(ref_argument);
+    }
+
+    std::string best_op_name;
+    float best_ave_time   = 0;
+    float best_tflops     = 0;
+    float best_gb_per_sec = 0;
+    int best_kbatch       = 0; // split factor is an integer, so it is stored and printed as one
+
+    // Split-K factors to try. Built once because the list does not depend on the instance.
+    const std::vector<int> kbatch_list =
+        (KBatch > 0) ? std::vector<int>{KBatch}
+                     : std::vector<int>{1, 2, 4, 8, 12, 16, 19, 20, 32, 38};
+
+    // profile device GEMM instances
+    for(auto& op_ptr : op_ptrs)
+    {
+        for(const int kbatch_curr : kbatch_list)
+        {
+            auto argument_ptr =
+                op_ptr->MakeArgumentPointer(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
+                                            static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
+                                            {},
+                                            static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
+                                            M,
+                                            N,
+                                            K,
+                                            StrideA,
+                                            StrideB,
+                                            {},
+                                            StrideC,
+                                            kbatch_curr,
+                                            a_element_op,
+                                            b_element_op,
+                                            c_element_op);
+
+            auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+            if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+            {
+                // Workspace must outlive both Run() calls below, hence this scope.
+                DeviceMem gemm_workspace_dev(op_ptr->GetWorkSpaceSize(argument_ptr.get()));
+                op_ptr->SetWorkSpacePointer(
+                    argument_ptr.get(), gemm_workspace_dev.GetDeviceBuffer(), StreamConfig{});
+
+                // re-init C to zero before profiling next kernel
+                c_device_buf.SetZero();
+
+                // Untimed run whose output is the one that gets verified.
+                invoker_ptr->Run(argument_ptr.get(),
+                                 StreamConfig{nullptr, false, 0, n_warmup, n_iter});
+
+                if(do_verification)
+                {
+                    c_device_buf.FromDevice(c_m_n_device_result.mData.data());
+
+                    // Exactly one comparison per run: with relaxed tolerances when any operand
+                    // is fp8, with check_err's defaults otherwise.
+#if defined CK_ENABLE_FP8
+                    // set softer tolerances for fp8
+                    if constexpr(is_same_v<ADataType, f8_t> || is_same_v<BDataType, f8_t> ||
+                                 is_same_v<CDataType, f8_t>)
+                    {
+                        std::string msg = "Error: Incorrect results!";
+                        double rtol     = 1e-1;
+                        double atol     = 1e-1;
+                        pass            = pass & ck::utils::check_err(
+                                          c_m_n_device_result, c_m_n_host_result, msg, rtol, atol);
+                    }
+                    else
+                    {
+#endif
+                        pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
+#if defined CK_ENABLE_FP8
+                    }
+#endif
+
+                    if(do_log)
+                    {
+                        LogRangeAsType<float>(std::cout << "a : ", a_m_k.mData, ",") << std::endl;
+                        LogRangeAsType<float>(std::cout << "b: ", b_k_n.mData, ",") << std::endl;
+                        LogRangeAsType<float>(
+                            std::cout << "c_host  : ", c_m_n_host_result.mData, ",")
+                            << std::endl;
+                        LogRangeAsType<float>(
+                            std::cout << "c_device: ", c_m_n_device_result.mData, ",")
+                            << std::endl;
+                    }
+                }
+
+                std::string op_name = op_ptr->GetTypeString();
+
+                // Measured run; rotating buffers are enabled only when more than one is needed.
+                float ave_time = invoker_ptr->Run(argument_ptr.get(),
+                                                  StreamConfig{nullptr,
+                                                               time_kernel,
+                                                               0,
+                                                               n_warmup,
+                                                               n_iter,
+                                                               rotating_count > 1,
+                                                               rotating_count});
+
+                std::size_t flop = std::size_t(2) * M * N * K;
+
+                std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
+                                        sizeof(CDataType) * M * N;
+
+                // ave_time is printed as milliseconds, so /1e9 and /1e6 give TFlops and GB/s.
+                float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+                float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+                std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops
+                          << " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", KBatch "
+                          << kbatch_curr << std::endl;
+
+                if(tflops > best_tflops)
+                {
+                    best_op_name    = op_name;
+                    best_tflops     = tflops;
+                    best_ave_time   = ave_time;
+                    best_gb_per_sec = gb_per_sec;
+                    best_kbatch     = kbatch_curr;
+                }
+            }
+            else
+            {
+                std::cout << op_ptr->GetTypeString() << " does not support this problem"
+                          << std::endl;
+            }
+        }
+    }
+
+    // Summary line: the datatype label is printed only for the C types listed here.
+    if constexpr(is_same<CDataType, float>::value)
+    {
+        std::cout << "Best Perf for datatype = f32";
+    }
+    else if constexpr(is_same<CDataType, half_t>::value)
+    {
+        std::cout << "Best Perf for datatype = f16";
+    }
+    else if constexpr(is_same<CDataType, bhalf_t>::value)
+    {
+        std::cout << "Best Perf for datatype = bf16";
+    }
+    else if constexpr(is_same<CDataType, int8_t>::value)
+    {
+        std::cout << "Best Perf for datatype = int8";
+    }
+
+    if constexpr(is_same<ALayout, tensor_layout::gemm::RowMajor>::value)
+    {
+        std::cout << " ALayout =  RowMajor";
+    }
+    else if constexpr(is_same<ALayout, tensor_layout::gemm::ColumnMajor>::value)
+    {
+        std::cout << " ALayout =  ColumnMajor";
+    }
+
+    if constexpr(is_same<BLayout, tensor_layout::gemm::RowMajor>::value)
+    {
+        std::cout << " BLayout =  RowMajor";
+    }
+    else if constexpr(is_same<BLayout, tensor_layout::gemm::ColumnMajor>::value)
+    {
+        std::cout << " BLayout =  ColumnMajor";
+    }
+
+    std::cout << " M = " << M << " N = " << N << " K = " << K << " StrideA = " << StrideA
+              << " StrideB = " << StrideB << " StrideC = " << StrideC << " KBatch = " << best_kbatch
+              << " : " << best_ave_time << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec
+              << " GB/s, " << best_op_name << std::endl;
+
+    return pass;
+}
+
+} // namespace profiler
+} // namespace ck
--- a/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp
@@ -33,7 +33,8 @@ template <ck::index_t NDimSpatial,
          typename WeiDataType,
          typename OutDataType,
          typename AComputeType = InDataType,
-          typename BComputeType = AComputeType>
+          typename BComputeType = AComputeType,
+          typename IndexType    = ck::index_t>
 bool profile_grouped_conv_fwd_impl(int do_verification,
                                   int init_method,
                                   bool do_log,
@@ -57,16 +58,16 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
    const auto out_g_n_k_wos_desc =
        ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);

-    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
-    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
-    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
-    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
-    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
-    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
-    std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
-    std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
-    std::array<ck::index_t, NDimSpatial> input_left_pads{};
-    std::array<ck::index_t, NDimSpatial> input_right_pads{};
+    std::array<IndexType, NDimSpatial + 3> a_g_n_c_wis_lengths{};
+    std::array<IndexType, NDimSpatial + 3> a_g_n_c_wis_strides{};
+    std::array<IndexType, NDimSpatial + 3> b_g_k_c_xs_lengths{};
+    std::array<IndexType, NDimSpatial + 3> b_g_k_c_xs_strides{};
+    std::array<IndexType, NDimSpatial + 3> e_g_n_k_wos_lengths{};
+    std::array<IndexType, NDimSpatial + 3> e_g_n_k_wos_strides{};
+    std::array<IndexType, NDimSpatial> conv_filter_strides{};
+    std::array<IndexType, NDimSpatial> conv_filter_dilations{};
+    std::array<IndexType, NDimSpatial> input_left_pads{};
+    std::array<IndexType, NDimSpatial> input_right_pads{};

    auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };


--- a/profiler/src/CMakeLists.txt
+++ b/profiler/src/CMakeLists.txt
@@ -46,12 +46,17 @@ if(GPU_TARGETS MATCHES "gfx9")
    list(APPEND PROFILER_SOURCES profile_grouped_gemm_multiply_tile_loop.cpp)
  endif()
  list(APPEND PROFILER_SOURCES profile_gemm_multiply_add.cpp)
+  if(GPU_TARGETS MATCHES "gfx94")
+    list(APPEND PROFILER_SOURCES profile_gemm_multiply_multiply.cpp)
+    list(APPEND PROFILER_SOURCES profile_gemm_ab_scale.cpp)
+  endif()
  list(APPEND PROFILER_SOURCES profile_batched_gemm.cpp)
  list(APPEND PROFILER_SOURCES profile_batched_gemm_reduce.cpp)
  list(APPEND PROFILER_SOURCES profile_gemm_add_multiply.cpp)
  list(APPEND PROFILER_SOURCES profile_gemm_bias_add_reduce.cpp)
  list(APPEND PROFILER_SOURCES profile_gemm_splitk.cpp)
  list(APPEND PROFILER_SOURCES profile_gemm_universal.cpp)
+  list(APPEND PROFILER_SOURCES profile_gemm_universal_reduce.cpp)
  list(APPEND PROFILER_SOURCES profile_gemm_universal_streamk.cpp)
  list(APPEND PROFILER_SOURCES profile_conv_fwd_bias_relu.cpp)
  list(APPEND PROFILER_SOURCES profile_conv_fwd_bias_relu_add.cpp)
@@ -79,6 +84,11 @@ set(PROFILER_EXECUTABLE ckProfiler)

 add_executable(${PROFILER_EXECUTABLE} ${PROFILER_SOURCES})
 target_compile_options(${PROFILER_EXECUTABLE} PRIVATE -Wno-global-constructors)
+# compress offloaded device code in the profiler; variable is passed by name so an unset value is false
+if(NOT WIN32 AND hip_VERSION_FLAT GREATER 600241132)
+  message("Adding --offload-compress flag for ${PROFILER_EXECUTABLE}")
+  target_compile_options(${PROFILER_EXECUTABLE} PRIVATE --offload-compress)
+endif()

 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE utility getopt::getopt)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_instance)
@@ -120,8 +130,13 @@ if(GPU_TARGETS MATCHES "gfx9")
  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_instance)
  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_reduce_instance)
  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_add_instance)
+  if(GPU_TARGETS MATCHES "gfx94")
+    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_multiply_instance)
+    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_ab_scale_instance)
+  endif()
  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_splitk_instance)
  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_instance)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_reduce_instance)
  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_streamk_instance)
  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_multiply_instance)
  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_reduce_instance)

--- a/profiler/src/profile_gemm_ab_scale.cpp
+++ b/profiler/src/profile_gemm_ab_scale.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "profiler/profile_gemm_ab_scale_impl.hpp"
+#include "profiler_operation_registry.hpp"
+
+// Operand layout combinations selectable through the "matrix layout" command-line argument.
+// Each letter pair names one operand's index order, listed as A_B_C (see the usage text).
+enum struct GemmMatrixLayout
+{
+    MK_KN_MN = 0,
+    MK_NK_MN = 1,
+    KM_KN_MN = 2,
+    KM_NK_MN = 3,
+};
+
+// Data-type combinations selectable through the "data type" command-line argument.
+// The numeric values are the ones listed in the usage text.
+enum struct GemmDataType
+{
+    F32_F32_F32    = 0,
+    F16_F16_F16    = 1,
+    BF16_BF16_BF16 = 2,
+    INT8_INT8_INT8 = 3,
+    F8_F16_F16     = 4,
+    F16_F8_F16     = 5,
+    F16_F16_F16_F8 = 6,
+    F8_F8_BF16     = 7,
+};
+
+// Scale block tile selectable through the "scale block tile" command-line argument.
+enum struct ScaleBlockTile
+{
+    Tile_128_128_128 = 0,
+};
+
+#define OP_NAME "gemm_ab_scale"
+#define OP_DESC "GEMM_AB_Scale"
+
+// Command-line entry point for the "gemm_ab_scale" profiler operation.
+//
+// Expects either 15 arguments (mandatory set) or 18 (mandatory set plus warm-up count,
+// iteration count and rotating-buffer size in MB). Any other count prints the usage text and
+// exits with status 1.
+//
+// Returns 0 when the selected configuration was profiled and passed, 1 when it failed or when
+// the requested data type / layout / scale block tile combination is not implemented.
+int profile_gemm_ab_scale(int argc, char* argv[])
+{
+    if(argc != 15 && argc != 18)
+    {
+        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
+        printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: f8@f16; 5: f16@f8; 6: "
+               "f16->f8; 7: f8->bf16, "
+               "comp f8)\n");
+        printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
+        printf("                     1: A[m, k] * B[n, k] = C[m, n];\n");
+        printf("                     2: A[k, m] * B[k, n] = C[m, n];\n");
+        printf("                     3: A[k, m] * B[n, k] = C[m, n])\n");
+        // Closing parenthesis added: the original line left the option list unbalanced.
+        printf("arg4: scale block tile (0: ScaleBlockM/N/K = [128, 128, 128])\n");
+        printf("arg5: verification (0: no; 1: yes)\n");
+        printf("arg6: initialization (0: no init; 1: integer value; 2: decimal value)\n");
+        printf("arg7: print tensor value (0: no; 1: yes)\n");
+        printf("arg8: time kernel (0=no, 1=yes)\n");
+        printf("arg9 to 14: M, N, K, StrideA, StrideB, StrideE\n");
+        printf("optional:\n");
+        printf("arg15: number of warm-up cycles (default 1)\n");
+        printf("arg16: number of iterations (default 10)\n");
+        printf("arg17: memory for rotating buffer (default 0, size in MB)\n");
+        exit(1);
+    }
+
+    const auto data_type        = static_cast<GemmDataType>(std::stoi(argv[2]));
+    const auto layout           = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
+    const auto scale_block_tile = static_cast<ScaleBlockTile>(std::stoi(argv[4]));
+    const bool do_verification  = std::stoi(argv[5]);
+    const int init_method       = std::stoi(argv[6]);
+    const bool do_log           = std::stoi(argv[7]);
+    const bool time_kernel      = std::stoi(argv[8]);
+
+    const int M = std::stoi(argv[9]);
+    const int N = std::stoi(argv[10]);
+    const int K = std::stoi(argv[11]);
+
+    // A negative stride means "use the default for the chosen layout" (resolved below).
+    const int StrideA = std::stoi(argv[12]);
+    const int StrideB = std::stoi(argv[13]);
+    const int StrideE = std::stoi(argv[14]);
+
+    int n_warmup      = 1;
+    int n_iter        = 10;
+    uint64_t rotating = 0;
+    if(argc == 18)
+    {
+        n_warmup = std::stoi(argv[15]);
+        n_iter   = std::stoi(argv[16]);
+        // The command line gives MB; the profiler implementation receives bytes.
+        rotating = std::stoull(argv[17]) * 1024 * 1024;
+    }
+
+    using F32  = float;
+    using BF16 = ck::bhalf_t;
+    using F8   = ck::f8_t;
+
+    using Row = ck::tensor_layout::gemm::RowMajor;
+    using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+    // Tag-dispatch helper: each argument is a default-constructed value whose type (or, for
+    // the scale blocks, whose compile-time value) selects one template instantiation of
+    // profile_gemm_ab_scale_impl. Yields the process exit code (0 = pass, 1 = fail).
+    auto profile = [&](auto a0_type,
+                       auto a1_type,
+                       auto b0_type,
+                       auto b1_type,
+                       auto comp_type,
+                       auto acc_type,
+                       auto c_type,
+                       auto scale_block_m,
+                       auto scale_block_n,
+                       auto scale_block_k,
+                       auto a_layout,
+                       auto b_layout,
+                       auto e_layout) {
+        using A0DataType      = decltype(a0_type);
+        using A1DataType      = decltype(a1_type);
+        using B0DataType      = decltype(b0_type);
+        using B1DataType      = decltype(b1_type);
+        using ComputeDataType = decltype(comp_type);
+        using AccDataType     = decltype(acc_type);
+        using EDataType       = decltype(c_type);
+
+        using ALayout = decltype(a_layout);
+        using BLayout = decltype(b_layout);
+        using ELayout = decltype(e_layout);
+
+        // Default stride per operand: K/N/N for row-major A/B/E, M/K/M for column-major.
+        const int DefaultStrideA = ck::is_same_v<ALayout, Row> ? K : M;
+        const int DefaultStrideB = ck::is_same_v<BLayout, Row> ? N : K;
+        const int DefaultStrideE = ck::is_same_v<ELayout, Row> ? N : M;
+
+        bool pass = ck::profiler::profile_gemm_ab_scale_impl<A0DataType,
+                                                             A1DataType,
+                                                             B0DataType,
+                                                             B1DataType,
+                                                             ComputeDataType,
+                                                             AccDataType,
+                                                             EDataType,
+                                                             scale_block_m,
+                                                             scale_block_n,
+                                                             scale_block_k,
+                                                             ALayout,
+                                                             BLayout,
+                                                             ELayout>(
+            do_verification,
+            init_method,
+            do_log,
+            time_kernel,
+            M,
+            N,
+            K,
+            (StrideA < 0) ? DefaultStrideA : StrideA,
+            (StrideB < 0) ? DefaultStrideB : StrideB,
+            (StrideE < 0) ? DefaultStrideE : StrideE,
+            n_warmup,
+            n_iter,
+            rotating);
+
+        return pass ? 0 : 1;
+    };
+
+    // Only one configuration is wired up: f8 inputs, bf16 output, A[m, k] * B[n, k], with a
+    // 128x128x128 scale block tile.
+    if(data_type == GemmDataType::F8_F8_BF16 && layout == GemmMatrixLayout::MK_NK_MN &&
+       scale_block_tile == ScaleBlockTile::Tile_128_128_128)
+    {
+        return profile(F8{},
+                       F32{},
+                       F8{},
+                       F32{},
+                       F8{},
+                       F32{},
+                       BF16{},
+                       ck::Number<128>{},
+                       ck::Number<128>{},
+                       ck::Number<128>{},
+                       Row{},
+                       Col{},
+                       Row{});
+    }
+    else
+    {
+        std::cout << "this data_type & layout is not implemented" << std::endl;
+
+        return 1;
+    }
+}
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_ab_scale);
--- a/profiler/src/profile_gemm_multiply_multiply.cpp
+++ b/profiler/src/profile_gemm_multiply_multiply.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "profiler/profile_gemm_multiply_multiply_impl.hpp"
+#include "profiler_operation_registry.hpp"
+
+// Operand layout combinations selectable through the "matrix layout" command-line argument.
+// Each letter pair names one operand's index order, listed as A_B_C (see the usage text).
+enum struct GemmMatrixLayout
+{
+    MK_KN_MN = 0,
+    MK_NK_MN = 1,
+    KM_KN_MN = 2,
+    KM_NK_MN = 3,
+};
+
+// Data-type combinations selectable through the "data type" command-line argument.
+// The numeric values are the ones listed in the usage text.
+enum struct GemmDataType
+{
+    F32_F32_F32    = 0,
+    F16_F16_F16    = 1,
+    BF16_BF16_BF16 = 2,
+    INT8_INT8_INT8 = 3,
+    F8_F16_F16     = 4,
+    F16_F8_F16     = 5,
+    F16_F16_F16_F8 = 6,
+    F8_F8_BF16     = 7,
+};
+
+#define OP_NAME "gemm_multiply_multiply"
+#define OP_DESC "GEMM_Multiply_Multiply"
+
+// Command-line entry point for the "gemm_multiply_multiply" profiler operation.
+//
+// Accepts 16 arguments (mandatory set) or 19 (plus warm-up count, iteration count and
+// rotating-buffer size in MB); any other count prints the usage text and exits with status 1.
+// Returns 0 when the profiled configuration passed, 1 when it failed or is not implemented.
+int profile_gemm_multiply_multiply(int argc, char* argv[])
+{
+    const bool has_tuning_args = (argc == 19);
+
+    if(!(argc == 16 || has_tuning_args))
+    {
+        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
+        printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: f8@f16; 5: f16@f8; 6: "
+               "f16->f8; 7: f8->bf16, "
+               "comp f8)\n");
+        printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
+        printf("                     1: A[m, k] * B[n, k] = C[m, n];\n");
+        printf("                     2: A[k, m] * B[k, n] = C[m, n];\n");
+        printf("                     3: A[k, m] * B[n, k] = C[m, n])\n");
+        printf("arg4: verification (0: no; 1: yes)\n");
+        printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
+        printf("arg6: print tensor value (0: no; 1: yes)\n");
+        printf("arg7: time kernel (0=no, 1=yes)\n");
+        printf("arg8 to 15: M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE\n");
+        printf("optional:\n");
+        printf("arg16: number of warm-up cycles (default 1)\n");
+        printf("arg17: number of iterations (default 10)\n");
+        printf("arg18: memory for rotating buffer (default 0, size in MB)\n");
+        exit(1);
+    }
+
+    // Mandatory arguments, parsed in command-line order.
+    const auto data_type       = static_cast<GemmDataType>(std::stoi(argv[2]));
+    const auto layout          = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
+    const bool do_verification = std::stoi(argv[4]);
+    const int init_method      = std::stoi(argv[5]);
+    const bool do_log          = std::stoi(argv[6]);
+    const bool time_kernel     = std::stoi(argv[7]);
+
+    const int M = std::stoi(argv[8]);
+    const int N = std::stoi(argv[9]);
+    const int K = std::stoi(argv[10]);
+
+    const int StrideA  = std::stoi(argv[11]);
+    const int StrideB  = std::stoi(argv[12]);
+    const int StrideD0 = std::stoi(argv[13]);
+    const int StrideD1 = std::stoi(argv[14]);
+    const int StrideE  = std::stoi(argv[15]);
+
+    // Optional tuning arguments; the rotating-buffer size arrives in MB and is kept in bytes.
+    int warmup_count        = 1;
+    int iter_count          = 10;
+    uint64_t rotating_bytes = 0;
+    if(has_tuning_args)
+    {
+        warmup_count   = std::stoi(argv[16]);
+        iter_count     = std::stoi(argv[17]);
+        rotating_bytes = std::stoull(argv[18]) * 1024 * 1024;
+    }
+
+    using F32  = float;
+    using BF16 = ck::bhalf_t;
+    using F8   = ck::f8_t;
+
+    using Row = ck::tensor_layout::gemm::RowMajor;
+    using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+    // A negative stride on the command line selects the layout-implied default.
+    const auto stride_or = [](int requested, int fallback) {
+        return (requested < 0) ? fallback : requested;
+    };
+
+    // Tag-dispatch helper: the argument types pick the template instantiation of
+    // profile_gemm_multiply_multiply_impl. Yields the exit code (0 = pass, 1 = fail).
+    const auto run_profile = [&](auto a_tag,
+                                 auto b_tag,
+                                 auto compute_tag,
+                                 auto acc_tag,
+                                 auto d0_tag,
+                                 auto d1_tag,
+                                 auto e_tag,
+                                 auto a_layout_tag,
+                                 auto b_layout_tag,
+                                 auto d0_layout_tag,
+                                 auto d1_layout_tag,
+                                 auto e_layout_tag) {
+        using AType       = decltype(a_tag);
+        using BType       = decltype(b_tag);
+        using ComputeType = decltype(compute_tag);
+        using AccType     = decltype(acc_tag);
+        using D0Type      = decltype(d0_tag);
+        using D1Type      = decltype(d1_tag);
+        using EType       = decltype(e_tag);
+
+        using ALay  = decltype(a_layout_tag);
+        using BLay  = decltype(b_layout_tag);
+        using D0Lay = decltype(d0_layout_tag);
+        using D1Lay = decltype(d1_layout_tag);
+        using ELay  = decltype(e_layout_tag);
+
+        // Layout-implied strides: K/N/N/N/N when row-major, M/K/M/M/M otherwise
+        // (order A, B, D0, D1, E).
+        const int fallback_a  = ck::is_same_v<ALay, Row> ? K : M;
+        const int fallback_b  = ck::is_same_v<BLay, Row> ? N : K;
+        const int fallback_d0 = ck::is_same_v<D0Lay, Row> ? N : M;
+        const int fallback_d1 = ck::is_same_v<D1Lay, Row> ? N : M;
+        const int fallback_e  = ck::is_same_v<ELay, Row> ? N : M;
+
+        const bool ok = ck::profiler::profile_gemm_multiply_multiply_impl<AType,
+                                                                          BType,
+                                                                          ComputeType,
+                                                                          AccType,
+                                                                          D0Type,
+                                                                          D1Type,
+                                                                          EType,
+                                                                          ALay,
+                                                                          BLay,
+                                                                          D0Lay,
+                                                                          D1Lay,
+                                                                          ELay>(
+            do_verification,
+            init_method,
+            do_log,
+            time_kernel,
+            M,
+            N,
+            K,
+            stride_or(StrideA, fallback_a),
+            stride_or(StrideB, fallback_b),
+            stride_or(StrideD0, fallback_d0),
+            stride_or(StrideD1, fallback_d1),
+            stride_or(StrideE, fallback_e),
+            warmup_count,
+            iter_count,
+            rotating_bytes);
+
+        return ok ? 0 : 1;
+    };
+
+    // Only f8 x f8 -> bf16 with A[m, k] * B[n, k] is wired up.
+    const bool is_implemented =
+        (data_type == GemmDataType::F8_F8_BF16) && (layout == GemmMatrixLayout::MK_NK_MN);
+
+    if(!is_implemented)
+    {
+        std::cout << "this data_type & layout is not implemented" << std::endl;
+
+        return 1;
+    }
+
+    return run_profile(
+        F8{}, F8{}, F8{}, F32{}, F32{}, F32{}, BF16{}, Row{}, Col{}, Row{}, Col{}, Row{});
+}
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_multiply_multiply);
--- a/profiler/src/profile_gemm_universal.cpp
+++ b/profiler/src/profile_gemm_universal.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>
 #include <numeric>
@@ -26,6 +26,7 @@ enum struct GemmDataType
    F8_F16_F16,     // 4
    F16_F8_F16,     // 5
    F16_F16_F16_F8, // 6
+    F8_F8_BF16,     // 7
 };

 #define OP_NAME "gemm_universal"
@@ -36,7 +37,8 @@ int profile_gemm_universal(int argc, char* argv[])
    if(argc != 15 && argc != 18)
    {
        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
-        printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: f8@f16; 5: f16@f8; 6: f16, "
+        printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: f8@f16; 5: f16@f8; 6: "
+               "f16->f8; 7: f8->bf16, "
               "comp f8)\n");
        printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
        printf("                     1: A[m, k] * B[n, k] = C[m, n];\n");
@@ -91,15 +93,17 @@ int profile_gemm_universal(int argc, char* argv[])

    auto profile = [&](auto a_type,
                       auto b_type,
+                       auto comp_type,
                       auto acc_type,
                       auto c_type,
                       auto a_layout,
                       auto b_layout,
                       auto c_layout) {
-        using ADataType   = decltype(a_type);
-        using BDataType   = decltype(b_type);
-        using AccDataType = decltype(acc_type);
-        using CDataType   = decltype(c_type);
+        using ADataType       = decltype(a_type);
+        using BDataType       = decltype(b_type);
+        using ComputeDataType = decltype(comp_type);
+        using AccDataType     = decltype(acc_type);
+        using CDataType       = decltype(c_type);

        using ALayout = decltype(a_layout);
        using BLayout = decltype(b_layout);
@@ -111,6 +115,7 @@ int profile_gemm_universal(int argc, char* argv[])

        bool pass = ck::profiler::profile_gemm_universal_impl<ADataType,
                                                              BDataType,
+                                                              ComputeDataType,
                                                              AccDataType,
                                                              CDataType,
                                                              ALayout,
@@ -136,35 +141,39 @@ int profile_gemm_universal(int argc, char* argv[])

    if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
    {
-        return profile(F16{}, F16{}, F32{}, F16{}, Row{}, Row{}, Row{});
+        return profile(F16{}, F16{}, F16{}, F32{}, F16{}, Row{}, Row{}, Row{});
    }
    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
    {
-        return profile(F16{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{});
+        return profile(F16{}, F16{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{});
    }
    else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_KN_MN)
    {
-        return profile(F16{}, F8{}, F32{}, F16{}, Row{}, Row{}, Row{});
+        return profile(F16{}, F8{}, F16{}, F32{}, F16{}, Row{}, Row{}, Row{});
    }
    else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_NK_MN)
    {
-        return profile(F16{}, F8{}, F32{}, F16{}, Row{}, Col{}, Row{});
+        return profile(F16{}, F8{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{});
    }
    else if(data_type == GemmDataType::F8_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
    {
-        return profile(F8{}, F16{}, F32{}, F16{}, Row{}, Row{}, Row{});
+        return profile(F8{}, F16{}, F16{}, F32{}, F16{}, Row{}, Row{}, Row{});
    }
    else if(data_type == GemmDataType::F8_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
    {
-        return profile(F8{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{});
+        return profile(F8{}, F16{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{});
    }
    else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_KN_MN)
    {
-        return profile(BF16{}, BF16{}, F32{}, BF16{}, Row{}, Row{}, Row{});
+        return profile(BF16{}, BF16{}, BF16{}, F32{}, BF16{}, Row{}, Row{}, Row{});
    }
    else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_NK_MN)
    {
-        return profile(BF16{}, BF16{}, F32{}, BF16{}, Row{}, Col{}, Row{});
+        return profile(BF16{}, BF16{}, BF16{}, F32{}, BF16{}, Row{}, Col{}, Row{});
+    }
+    else if(data_type == GemmDataType::F8_F8_BF16 && layout == GemmMatrixLayout::MK_NK_MN)
+    {
+        return profile(F8{}, F8{}, F8{}, F32{}, BF16{}, Row{}, Col{}, Row{});
    }
    else
    {

--- a/profiler/src/profile_gemm_universal_reduce.cpp
+++ b/profiler/src/profile_gemm_universal_reduce.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "profiler/profile_gemm_universal_reduce_impl.hpp"
+#include "profiler_operation_registry.hpp"
+
+// Operand layout selector, decoded from argv[3] in profile_gemm_universal_reduce().
+// Only MK_KN_MN (A[m, k] * B[k, n] = C[m, n]) is dispatched in this file; every other
+// value falls through to the "not implemented" branch.
+enum struct GemmMatrixLayout
+{
+    MK_KN_MN, // 0
+    MK_NK_MN, // 1
+    KM_KN_MN, // 2
+    KM_NK_MN, // 3
+};
+
+// Data-type selector, decoded from argv[2] in profile_gemm_universal_reduce().
+// Only F16_F16_F16, BF16_BF16_BF16 and BF16_I8_BF16 are dispatched in this file; every
+// other value falls through to the "not implemented" branch.
+enum struct GemmDataType
+{
+    F32_F32_F32,    // 0
+    F16_F16_F16,    // 1
+    BF16_BF16_BF16, // 2
+    INT8_INT8_INT8, // 3
+    F8_F16_F16,     // 4
+    BF16_I8_BF16,   // 5
+    F16_F16_F16_F8, // 6
+};
+
+// Name and description of this profiler operation; both are printed in the usage text and
+// passed to REGISTER_PROFILER_OPERATION at the end of this file.
+#define OP_NAME "gemm_universal_reduce"
+#define OP_DESC "Universal GEMM"
+
+/// Command-line entry point of the "gemm_universal_reduce" profiler operation.
+///
+/// Accepts arg1..arg14 (argc == 15), optionally followed by arg15..arg17 (argc == 18:
+/// warm-up cycles, iterations, rotating-buffer size in MB). With any other argument count
+/// the usage text is printed and the process exits with status 1.
+///
+/// A negative StrideA/StrideB/StrideC selects the default stride derived from M/N/K and the
+/// operand layout.
+///
+/// @return 0 when profile_gemm_universal_reduce_impl returns true, 1 when it returns false
+///         or when the requested data_type/layout combination is not dispatched here.
+int profile_gemm_universal_reduce(int argc, char* argv[])
+{
+    if(argc != 15 && argc != 18)
+    {
+        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
+        printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: f8@f16; 5: bf16@i8; "
+               "6: f16, comp f8)\n");
+        printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n])\n");
+        printf("arg4: verification (0: no; 1: yes)\n");
+        printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
+        printf("arg6: print tensor value (0: no; 1: yes)\n");
+        printf("arg7: time kernel (0=no, 1=yes)\n");
+        printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
+        printf("arg14: split k into multiple batches\n");
+        printf("optional:\n");
+        printf("arg15: number of warm-up cycles (default 1)\n");
+        printf("arg16: number of iterations (default 10)\n");
+        printf("arg17: memory for rotating buffer (default 0, size in MB)\n");
+        exit(1);
+    }
+
+    const auto data_type       = static_cast<GemmDataType>(std::stoi(argv[2]));
+    const auto layout          = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
+    const bool do_verification = std::stoi(argv[4]);
+    const int init_method      = std::stoi(argv[5]);
+    const bool do_log          = std::stoi(argv[6]);
+    const bool time_kernel     = std::stoi(argv[7]);
+
+    const int M = std::stoi(argv[8]);
+    const int N = std::stoi(argv[9]);
+    const int K = std::stoi(argv[10]);
+
+    const int StrideA = std::stoi(argv[11]);
+    const int StrideB = std::stoi(argv[12]);
+    const int StrideC = std::stoi(argv[13]);
+    const int KBatch  = std::stoi(argv[14]);
+
+    int n_warmup      = 1;
+    int n_iter        = 10;
+    uint64_t rotating = 0;
+    if(argc == 18)
+    {
+        n_warmup = std::stoi(argv[15]);
+        n_iter   = std::stoi(argv[16]);
+        // arg17 is given in MB; convert it to bytes.
+        rotating = std::stoull(argv[17]) * 1024 * 1024;
+    }
+
+    using F32  = float;
+    using F16  = ck::half_t;
+    using BF16 = ck::bhalf_t;
+    using I8   = int8_t;
+
+    using Row = ck::tensor_layout::gemm::RowMajor;
+
+    // No additional D tensors are passed for this operation.
+    using DsDataType = ck::Tuple<>;
+    using DsLayout   = ck::Tuple<>;
+
+    // The tag-value arguments only carry types; the lambda turns them into the template
+    // arguments of profile_gemm_universal_reduce_impl.
+    auto profile = [&](auto a_type,
+                       auto b_type,
+                       auto acc_type,
+                       auto c_type,
+                       auto a_layout,
+                       auto b_layout,
+                       auto c_layout) {
+        using ADataType   = decltype(a_type);
+        using BDataType   = decltype(b_type);
+        using AccDataType = decltype(acc_type);
+        using CDataType   = decltype(c_type);
+
+        using ALayout = decltype(a_layout);
+        using BLayout = decltype(b_layout);
+        using CLayout = decltype(c_layout);
+
+        // Strides used when the caller passes a negative stride on the command line.
+        const int DefaultStrideA = ck::is_same_v<ALayout, Row> ? K : M;
+        const int DefaultStrideB = ck::is_same_v<BLayout, Row> ? N : K;
+        const int DefaultStrideC = ck::is_same_v<CLayout, Row> ? N : M;
+
+        bool pass = ck::profiler::profile_gemm_universal_reduce_impl<ADataType,
+                                                                     BDataType,
+                                                                     DsDataType,
+                                                                     AccDataType,
+                                                                     CDataType,
+                                                                     ALayout,
+                                                                     BLayout,
+                                                                     DsLayout,
+                                                                     CLayout>(
+            do_verification,
+            init_method,
+            do_log,
+            time_kernel,
+            M,
+            N,
+            K,
+            (StrideA < 0) ? DefaultStrideA : StrideA,
+            (StrideB < 0) ? DefaultStrideB : StrideB,
+            (StrideC < 0) ? DefaultStrideC : StrideC,
+            KBatch,
+            n_warmup,
+            n_iter,
+            rotating);
+
+        return pass ? 0 : 1;
+    };
+
+    if(data_type == GemmDataType::BF16_I8_BF16 && layout == GemmMatrixLayout::MK_KN_MN)
+    {
+        return profile(BF16{}, I8{}, F32{}, BF16{}, Row{}, Row{}, Row{});
+    }
+    else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_KN_MN)
+    {
+        return profile(BF16{}, BF16{}, F32{}, BF16{}, Row{}, Row{}, Row{});
+    }
+    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
+    {
+        return profile(F16{}, F16{}, F32{}, F16{}, Row{}, Row{}, Row{});
+    }
+    else
+    {
+        std::cout << "this data_type & layout is not implemented" << std::endl;
+
+        return 1;
+    }
+}
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_universal_reduce);
--- a/profiler/src/profile_grouped_conv_fwd.cpp
+++ b/profiler/src/profile_grouped_conv_fwd.cpp
@@ -29,6 +29,12 @@ enum struct ConvDataType
    BF8_F8_F8,      // 7
 };

+enum struct IndexType
+{
+    INDEX_T,      // 0
+    LONG_INDEX_T, // 1
+};
+
 #define OP_NAME "grouped_conv_fwd"
 #define OP_DESC "Grouped Convolution Forward"

@@ -45,12 +51,13 @@ static void print_helper_msg()
        << "                 5: Input bf8, Weight bf8, Output fp8\n"
        << "                 6: Input fp8, Weight bf8, Output fp8\n"
        << "                 7: Input bf8, Weight fp8, Output fp8)\n"
        << "arg3: tensor layout (0: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, N, Ho, Wo, K]\n"
        << "                     1: Input[N, Hi, Wi, G, C], Weight[G, K, Y, X, C], Output[N, Ho, Wo, G, K])\n"
-        << "arg4: verification (0: no, 1: yes)\n"
-        << "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n"
-        << "arg6: print tensor value (0: no; 1: yes)\n"
-        << "arg7: time kernel (0: no, 1: yes)\n"
+        << "arg4: indexing data type (0: 32-bit, 1: 64-bit)\n"
+        << "arg5: verification (0: no, 1: yes)\n"
+        << "arg6: initialization (0: no init, 1: integer value, 2: decimal value)\n"
+        << "arg7: print tensor value (0: no; 1: yes)\n"
+        << "arg8: time kernel (0: no, 1: yes)\n"
        << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
    // clang-format on
 }
@@ -60,7 +67,7 @@ static void print_helper_msg()
 int profile_grouped_conv_fwd(int argc, char* argv[])
 {
-    // 8 for control, 1 for num_dim_spatial
+    // 9 for control, 1 for num_dim_spatial
-    if(argc < 9)
+    if(argc < 10)
    {
        print_helper_msg();
        return 1;
@@ -68,20 +75,21 @@ int profile_grouped_conv_fwd(int argc, char* argv[])

    const auto data_type       = static_cast<ConvDataType>(std::stoi(argv[2]));
    const auto layout          = static_cast<ConvLayout>(std::stoi(argv[3]));
-    const bool do_verification = std::stoi(argv[4]);
-    const int init_method      = std::stoi(argv[5]);
-    const bool do_log          = std::stoi(argv[6]);
-    const bool time_kernel     = std::stoi(argv[7]);
-    const int num_dim_spatial  = std::stoi(argv[8]);
+    const auto index_type      = static_cast<IndexType>(std::stoi(argv[4]));
+    const bool do_verification = std::stoi(argv[5]);
+    const int init_method      = std::stoi(argv[6]);
+    const bool do_log          = std::stoi(argv[7]);
+    const bool time_kernel     = std::stoi(argv[8]);
+    const int num_dim_spatial  = std::stoi(argv[9]);

-    // 8 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial
-    if(argc != 8 + 1 + 4 + 6 * num_dim_spatial)
+    // 9 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial
+    if(argc != 9 + 1 + 4 + 6 * num_dim_spatial)
    {
        print_helper_msg();
        return 1;
    }

-    const auto params = ck::utils::conv::parse_conv_param(num_dim_spatial, 9, argv);
+    const auto params = ck::utils::conv::parse_conv_param(num_dim_spatial, 10, argv);

    using F32  = float;
    using F16  = ck::half_t;
@@ -138,18 +146,43 @@ int profile_grouped_conv_fwd(int argc, char* argv[])
        using AComputeType = decltype(a_compute_type);
        using BComputeType = decltype(b_compute_type);

-        bool pass = ck::profiler::profile_grouped_conv_fwd_impl<NDimSpatial,
-                                                                InLayout,
-                                                                WeiLayout,
-                                                                OutLayout,
-                                                                InDataType,
-                                                                WeiDataType,
-                                                                OutDataType,
-                                                                AComputeType,
-                                                                BComputeType>(
-            do_verification, init_method, do_log, time_kernel, params);
+        if(index_type == IndexType::INDEX_T)
+        {
+            bool pass = ck::profiler::profile_grouped_conv_fwd_impl<NDimSpatial,
+                                                                    InLayout,
+                                                                    WeiLayout,
+                                                                    OutLayout,
+                                                                    InDataType,
+                                                                    WeiDataType,
+                                                                    OutDataType,
+                                                                    AComputeType,
+                                                                    BComputeType,
+                                                                    ck::index_t>(
+                do_verification, init_method, do_log, time_kernel, params);
+
+            return pass ? 0 : 1;
+        }
+        else if(index_type == IndexType::LONG_INDEX_T)
+        {
+            bool pass = ck::profiler::profile_grouped_conv_fwd_impl<NDimSpatial,
+                                                                    InLayout,
+                                                                    WeiLayout,
+                                                                    OutLayout,
+                                                                    InDataType,
+                                                                    WeiDataType,
+                                                                    OutDataType,
+                                                                    AComputeType,
+                                                                    BComputeType,
+                                                                    ck::long_index_t>(
+                do_verification, init_method, do_log, time_kernel, params);

-        return pass ? 0 : 1;
+            return pass ? 0 : 1;
+        }
+        else
+        {
+            std::cout << "this indexing data type is not implemented" << std::endl;
+            return 1;
+        }
    };

    // GNHWC_GKYXC_GNHWK

--- a/python/ck4inductor/universal_gemm/gen_instances.py
+++ b/python/ck4inductor/universal_gemm/gen_instances.py
@@ -62,17 +62,13 @@ def parse_instances(str_instances: List[str]) -> List[CKGemmOperation]:
                    i_current = i_next + 1
            if i_next == -1:
                break
-        # pad with `None`s for the fields which are not defined in the instance
+
+        template_args.insert(2, tuple())  # ds layout
+        template_args.insert(6, tuple())  # ds dtype
+
        new_instance = CKGemmOperation(
            *template_args,  # type: ignore[arg-type]
-            *((None,) * (len(fields(CKGemmOperation)) - len(template_args))),
        )
-        # the last 2 template parameters are optional
-        # if they are absent, substitute them with default values from Universal Gemm C++ template declaration
-        if new_instance.a_compute_dtype is None:
-            new_instance.a_compute_dtype = new_instance.c_element_dtype
-        if new_instance.b_compute_dtype is None:
-            new_instance.b_compute_dtype = new_instance.c_element_dtype

        op_instances.append(new_instance)
    return op_instances
@@ -208,6 +204,8 @@ def gen_ops_preselected() -> List[CKGemmOperation]:
        a_layout="Row",
        b_layout="Col",
        c_layout="Row",
+        ds_element_dtypes=tuple(),
+        ds_layouts=tuple(),
        a_element_dtype="F16",
        b_element_dtype="F16",
        c_element_dtype="F16",

--- a/python/ck4inductor/universal_gemm/op.py
+++ b/python/ck4inductor/universal_gemm/op.py
@@ -10,10 +10,12 @@ class CKGemmOperation:

    a_layout: str
    b_layout: str
+    ds_layouts: Tuple[str, ...]  # addmm specific; variable length, may be empty
    c_layout: str

    a_element_dtype: str
    b_element_dtype: str
+    ds_element_dtypes: Tuple[str, ...]  # addmm specific; variable length, may be empty
    c_element_dtype: str

    acc_dtype: str
@@ -64,16 +66,15 @@ class CKGemmOperation:
        Tuple[int, int, int, int]
    )
    c_shuffle_block_transfer_scalar_per_vector_n_per_block: int
-
    block_gemm_pipeline_scheduler: str
-    block_gemm_pipeline_version: Optional[str]
+    block_gemm_pipeline_version: str

-    a_compute_dtype: Optional[str]
-    b_compute_dtype: Optional[str]
+    a_compute_dtype: Optional[str] = None
+    b_compute_dtype: Optional[str] = None

    def name(self):
        # cpp alias for template instance
-        return f"ck_devicegemm_xdl_shuffle_v3_{self.key_name()}"
+        return f"ck_devicegemm_multid_xdl_shuffle_v3_{self.key_name()}"

    def key_name(self):
        # TBD; must be unique per instance. Intended to use as dict key

--- a/script/check_copyright_year.sh
+++ b/script/check_copyright_year.sh
--- a/script/process_perf_data.py
+++ b/script/process_perf_data.py
@@ -143,6 +143,12 @@ def parse_logfile(logfile):
            if 'Best Perf' in line:
                lst=line.split()
                res.append(lst[36])
+    elif 'perf_fmha' in logfile:
+        for line in open(logfile):
+            if 'TFlops' in line:
+                lst=line.split()
+                line_dict=dict(zip(lst[1:],lst))
+                res.append(line_dict['TFlops,'])
    return res


@@ -304,6 +310,14 @@ def main():
            for i in range(1,len(results)+1):
                testlist.append("Test%i"%i)
            table_name="ck_mixed_gemm_tflops"
+        if 'fmha_fwd' in filename:
+            for i in range(1,len(results)+1):
+                testlist.append("Test%i"%i)
+            table_name="ck_fmha_fwd_tflops"
+        if 'fmha_bwd' in filename:
+            for i in range(1,len(results)+1):
+                testlist.append("Test%i"%i)
+            table_name="ck_fmha_bwd_tflops"

        tflops_base = get_baseline(table_name,conn)
        store_new_test_result(table_name, results, testlist, branch_name, node_id, gpu_arch, compute_units, rocm_vers, hip_vers, environment, conn)

--- a/script/process_perf_data.sh
+++ b/script/process_perf_data.sh
@@ -13,3 +13,20 @@
 python3 process_perf_data.py perf_gemm.log
 python3 process_perf_data.py perf_resnet50_N256.log
 python3 process_perf_data.py perf_resnet50_N4.log
+
+file=./perf_fmha_fwd_gfx942.log
+if [ -e "$file" ]; then
+    python3 process_perf_data.py perf_fmha_fwd_gfx942.log
+fi
+file=./perf_fmha_bwd_gfx942.log
+if [ -e "$file" ]; then
+    python3 process_perf_data.py perf_fmha_bwd_gfx942.log
+fi
+file=./perf_fmha_fwd_gfx90a.log
+if [ -e "$file" ]; then
+    python3 process_perf_data.py perf_fmha_fwd_gfx90a.log
+fi
+file=./perf_fmha_bwd_gfx90a.log
+if [ -e "$file" ]; then
+    python3 process_perf_data.py perf_fmha_bwd_gfx90a.log
+fi
--- a/script/process_qa_data.sh
+++ b/script/process_qa_data.sh
@@ -21,3 +21,20 @@ python3 process_perf_data.py perf_gemm_bilinear.log
 python3 process_perf_data.py perf_reduction.log
 python3 process_perf_data.py perf_splitK_gemm.log
 python3 process_perf_data.py perf_onnx_gemm.log
+
+file=./perf_fmha_fwd_gfx942.log
+if [ -e "$file" ]; then
+    python3 process_perf_data.py perf_fmha_fwd_gfx942.log
+fi
+file=./perf_fmha_bwd_gfx942.log
+if [ -e "$file" ]; then
+    python3 process_perf_data.py perf_fmha_bwd_gfx942.log
+fi
+file=./perf_fmha_fwd_gfx90a.log
+if [ -e "$file" ]; then
+    python3 process_perf_data.py perf_fmha_fwd_gfx90a.log
+fi
+file=./perf_fmha_bwd_gfx90a.log
+if [ -e "$file" ]; then
+    python3 process_perf_data.py perf_fmha_bwd_gfx90a.log
+fi
--- a/script/test_reduce_with_index.sh
+++ b/script/test_reduce_with_index.sh
-#!/bin/bash
-
-## The following will be used for CI
-
-set -x
-
-## for float
-bin/test_reduce_with_index -D 64,4,280,82  -R 0,1,2,3  0 2
-bin/test_reduce_with_index -D 64,4,280,82  -R 0,1,2  0 2
-bin/test_reduce_with_index -D 64,4,280,82  -R 0,1,3  0 2
-bin/test_reduce_with_index -D 64,4,280,82  -R 0,2,3  0 2
-bin/test_reduce_with_index -D 64,4,280,82  -R 1,2,3  0 2
-bin/test_reduce_with_index -D 64,4,280,82  -R 0  0 2
-bin/test_reduce_with_index -D 64,4,280,82  -R 1  0 2
-bin/test_reduce_with_index -D 64,4,280,82  -R 2  0 2
-bin/test_reduce_with_index -D 64,4,280,82  -R 3  0 2
-
-## for float64
-bin/test_reduce_with_index -D 64,4,280,82  -R 0,1,2,3  6 2
-bin/test_reduce_with_index -D 64,4,280,82  -R 0,1,2  6 2
-bin/test_reduce_with_index -D 64,4,280,82  -R 0,1,3  6 2
-bin/test_reduce_with_index -D 64,4,280,82  -R 0,2,3  6 2
-bin/test_reduce_with_index -D 64,4,280,82  -R 1,2,3  6 2
-bin/test_reduce_with_index -D 64,4,280,82  -R 0  6 2
-bin/test_reduce_with_index -D 64,4,280,82  -R 1  6 2
-bin/test_reduce_with_index -D 64,4,280,82  -R 2  6 2
-bin/test_reduce_with_index -D 64,4,280,82  -R 3  6 2
-
-## for float16
-bin/test_reduce_with_index -D 64,4,280,82  -R 0,1,2,3  1 2
-bin/test_reduce_with_index -D 64,4,280,82  -R 0,1,2  1 2
-bin/test_reduce_with_index -D 64,4,280,82  -R 0,1,3  1 2
-bin/test_reduce_with_index -D 64,4,280,82  -R 0,2,3  1 2
-bin/test_reduce_with_index -D 64,4,280,82  -R 1,2,3  1 2
-bin/test_reduce_with_index -D 64,4,280,82  -R 0  1 2
-bin/test_reduce_with_index -D 64,4,280,82  -R 1  1 2
-bin/test_reduce_with_index -D 64,4,280,82  -R 2  1 2
-bin/test_reduce_with_index -D 64,4,280,82  -R 3  1 2
-
-## for int8_t
-bin/test_reduce_with_index -D 64,4,280,82  -R 0,1,2,3  3 2
-bin/test_reduce_with_index -D 64,4,280,82  -R 0,1,2  3 2
-bin/test_reduce_with_index -D 64,4,280,82  -R 0,1,3  3 2
-bin/test_reduce_with_index -D 64,4,280,82  -R 0,2,3  3 2
-bin/test_reduce_with_index -D 64,4,280,82  -R 1,2,3  3 2
-bin/test_reduce_with_index -D 64,4,280,82  -R 0  3 2
-bin/test_reduce_with_index -D 64,4,280,82  -R 1  3 2
-bin/test_reduce_with_index -D 64,4,280,82  -R 2  3 2
-bin/test_reduce_with_index -D 64,4,280,82  -R 3  3 2
-
-## for bfloat16
-bin/test_reduce_with_index -D 64,4,280,82  -R 0,1,2,3  5 2
-bin/test_reduce_with_index -D 64,4,280,82  -R 0,1,2  5 2
-bin/test_reduce_with_index -D 64,4,280,82  -R 0,1,3  5 2
-bin/test_reduce_with_index -D 64,4,280,82  -R 0,2,3  5 2
-bin/test_reduce_with_index -D 64,4,280,82  -R 1,2,3  5 2
-bin/test_reduce_with_index -D 64,4,280,82  -R 0  5 2
-bin/test_reduce_with_index -D 64,4,280,82  -R 1  5 2
-bin/test_reduce_with_index -D 64,4,280,82  -R 2  5 2
-bin/test_reduce_with_index -D 64,4,280,82  -R 3  5 2
-
-set +x
-
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -68,11 +68,11 @@ function(add_test_executable TEST_NAME)
    #only continue if there are some source files left on the list
    if(ARGN)
        if(ARGN MATCHES "_xdl")
-             list(REMOVE_ITEM TEST_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103)
+             list(REMOVE_ITEM TEST_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
        elseif(ARGN MATCHES "_wmma")
             list(REMOVE_ITEM TEST_TARGETS gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
        elseif(ARGN MATCHES "_smfmac")
-             list(REMOVE_ITEM TEST_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a)
+             list(REMOVE_ITEM TEST_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a gfx1200 gfx1201)
        endif()
        set_source_files_properties(${ARGN} PROPERTIES LANGUAGE HIP)
        add_executable(${TEST_NAME} ${ARGN})
@@ -149,11 +149,11 @@ function(add_gtest_executable TEST_NAME)
    #only continue if there are some source files left on the list
    if(ARGN)
        if(ARGN MATCHES "_xdl")
-             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103)
+             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
        elseif(ARGN MATCHES "_wmma")
             list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
        elseif(ARGN MATCHES "_smfmac")
-             list(REMOVE_ITEM TEST_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a)
+             list(REMOVE_ITEM TEST_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a gfx1200 gfx1201)
        endif()
        set_source_files_properties(${ARGN} PROPERTIES LANGUAGE HIP)
        add_executable(${TEST_NAME} ${ARGN})

--- a/test/conv_util/conv_util.cpp
+++ b/test/conv_util/conv_util.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>
 #include <string>
@@ -24,12 +24,12 @@ class TestConvUtil : public ::testing::Test
                                                 128,
                                                 192,
                                                 256,
-                                                 std::vector<ck::index_t>(ndims, 3),
-                                                 std::vector<ck::index_t>(ndims, 71),
-                                                 std::vector<ck::index_t>(ndims, s),
-                                                 std::vector<ck::index_t>(ndims, d),
-                                                 std::vector<ck::index_t>(ndims, p),
-                                                 std::vector<ck::index_t>(ndims, p));
+                                                 std::vector<ck::long_index_t>(ndims, 3),
+                                                 std::vector<ck::long_index_t>(ndims, 71),
+                                                 std::vector<ck::long_index_t>(ndims, s),
+                                                 std::vector<ck::long_index_t>(ndims, d),
+                                                 std::vector<ck::long_index_t>(ndims, p),
+                                                 std::vector<ck::long_index_t>(ndims, p));
    }

    protected:
@@ -48,35 +48,35 @@ TEST_F(TestConvUtil, ConvParamsGetOutputSpatialLengths1D)
 {
    // stride 2, dilation 1, pad 1
    SetNDParams(1, 2, 1, 1);
-    std::vector<ck::index_t> out_spatial_len = conv_params.GetOutputSpatialLengths();
+    std::vector<ck::long_index_t> out_spatial_len = conv_params.GetOutputSpatialLengths();
    EXPECT_TRUE(ck::utils::check_err(
-        out_spatial_len, std::vector<ck::index_t>{36}, "Error: ConvParams 1D."));
+        out_spatial_len, std::vector<ck::long_index_t>{36}, "Error: ConvParams 1D."));

    // stride 1, dilation 1, pad 1
    SetNDParams(1, 1, 1, 1);
    out_spatial_len = conv_params.GetOutputSpatialLengths();
    EXPECT_TRUE(ck::utils::check_err(
-        out_spatial_len, std::vector<ck::index_t>{71}, "Error: ConvParams 1D stride {1}."));
+        out_spatial_len, std::vector<ck::long_index_t>{71}, "Error: ConvParams 1D stride {1}."));

    // stride 2, dilation 1, pad 2
    SetNDParams(1, 2, 1, 2);
    out_spatial_len = conv_params.GetOutputSpatialLengths();
    EXPECT_TRUE(ck::utils::check_err(out_spatial_len,
-                                     std::vector<ck::index_t>{37},
+                                     std::vector<ck::long_index_t>{37},
                                     "Error: ConvParams 1D padding left/right {2}."));

    // stride 2, dilation 2, pad 2
    SetNDParams(1, 2, 2, 2);
    out_spatial_len = conv_params.GetOutputSpatialLengths();
    EXPECT_TRUE(ck::utils::check_err(
-        out_spatial_len, std::vector<ck::index_t>{36}, "Error: ConvParams 1D dilation {2}."));
+        out_spatial_len, std::vector<ck::long_index_t>{36}, "Error: ConvParams 1D dilation {2}."));

    // stride 3, dilation 2, pad 1
    SetNDParams(1, 3, 2, 1);
    out_spatial_len = conv_params.GetOutputSpatialLengths();
    EXPECT_TRUE(
        ck::utils::check_err(out_spatial_len,
-                             std::vector<ck::index_t>{23},
+                             std::vector<ck::long_index_t>{23},
                             "Error: ConvParams 1D strides{3}, padding {1}, dilations {2}."));
 }

@@ -84,36 +84,38 @@ TEST_F(TestConvUtil, ConvParamsGetOutputSpatialLengths2D)
 {
    // stride 2, dilation 1, pad 1
    SetNDParams(2, 2, 1, 1);
-    std::vector<ck::index_t> out_spatial_len = conv_params.GetOutputSpatialLengths();
+    std::vector<ck::long_index_t> out_spatial_len = conv_params.GetOutputSpatialLengths();
    EXPECT_TRUE(ck::utils::check_err(out_spatial_len,
-                                     std::vector<ck::index_t>{36, 36},
+                                     std::vector<ck::long_index_t>{36, 36},
                                     "Error: ConvParams 2D default constructor."));

    // stride 1, dilation 1, pad 1
    SetNDParams(2, 1, 1, 1);
    out_spatial_len = conv_params.GetOutputSpatialLengths();
-    EXPECT_TRUE(ck::utils::check_err(
-        out_spatial_len, std::vector<ck::index_t>{71, 71}, "Error: ConvParams 2D stride {1,1}."));
+    EXPECT_TRUE(ck::utils::check_err(out_spatial_len,
+                                     std::vector<ck::long_index_t>{71, 71},
+                                     "Error: ConvParams 2D stride {1,1}."));

    // stride 2, dilation 1, pad 2
    SetNDParams(2, 2, 1, 2);
    out_spatial_len = conv_params.GetOutputSpatialLengths();
    EXPECT_TRUE(ck::utils::check_err(out_spatial_len,
-                                     std::vector<ck::index_t>{37, 37},
+                                     std::vector<ck::long_index_t>{37, 37},
                                     "Error: ConvParams 2D padding left/right {2,2}."));

    // stride 2, dilation 2, pad 2
    SetNDParams(2, 2, 2, 2);
    out_spatial_len = conv_params.GetOutputSpatialLengths();
-    EXPECT_TRUE(ck::utils::check_err(
-        out_spatial_len, std::vector<ck::index_t>{36, 36}, "Error: ConvParams 2D dilation {2,2}."));
+    EXPECT_TRUE(ck::utils::check_err(out_spatial_len,
+                                     std::vector<ck::long_index_t>{36, 36},
+                                     "Error: ConvParams 2D dilation {2,2}."));

    // stride 3, dilation 2, pad 1
    SetNDParams(2, 3, 2, 1);
    out_spatial_len = conv_params.GetOutputSpatialLengths();
    EXPECT_TRUE(
        ck::utils::check_err(out_spatial_len,
-                             std::vector<ck::index_t>{23, 23},
+                             std::vector<ck::long_index_t>{23, 23},
                             "Error: ConvParams 2D strides{3,3}, padding {1,1}, dilations {2,2}."));
 }

@@ -121,29 +123,29 @@ TEST_F(TestConvUtil, ConvParamsGetOutputSpatialLengths3D)
 {
    // stride 2, dilation 1, pad 1
    SetNDParams(3, 2, 1, 1);
-    std::vector<ck::index_t> out_spatial_len = conv_params.GetOutputSpatialLengths();
+    std::vector<ck::long_index_t> out_spatial_len = conv_params.GetOutputSpatialLengths();
    EXPECT_TRUE(ck::utils::check_err(
-        out_spatial_len, std::vector<ck::index_t>{36, 36, 36}, "Error: ConvParams 3D."));
+        out_spatial_len, std::vector<ck::long_index_t>{36, 36, 36}, "Error: ConvParams 3D."));

    // stride 1, dilation 1, pad 1
    SetNDParams(3, 1, 1, 1);
    out_spatial_len = conv_params.GetOutputSpatialLengths();
    EXPECT_TRUE(ck::utils::check_err(out_spatial_len,
-                                     std::vector<ck::index_t>{71, 71, 71},
+                                     std::vector<ck::long_index_t>{71, 71, 71},
                                     "Error: ConvParams 3D stride {1, 1, 1}."));

    // stride 2, dilation 1, pad 2
    SetNDParams(3, 2, 1, 2);
    out_spatial_len = conv_params.GetOutputSpatialLengths();
    EXPECT_TRUE(ck::utils::check_err(out_spatial_len,
-                                     std::vector<ck::index_t>{37, 37, 37},
+                                     std::vector<ck::long_index_t>{37, 37, 37},
                                     "Error: ConvParams 3D padding left/right {2, 2, 2}."));

    // stride 2, dilation 2, pad 2
    SetNDParams(3, 2, 2, 2);
    out_spatial_len = conv_params.GetOutputSpatialLengths();
    EXPECT_TRUE(ck::utils::check_err(out_spatial_len,
-                                     std::vector<ck::index_t>{36, 36, 36},
+                                     std::vector<ck::long_index_t>{36, 36, 36},
                                     "Error: ConvParams 3D dilation {2, 2, 2}."));

    // stride 3, dilation 2, pad 1
@@ -151,6 +153,6 @@ TEST_F(TestConvUtil, ConvParamsGetOutputSpatialLengths3D)
    out_spatial_len = conv_params.GetOutputSpatialLengths();
    EXPECT_TRUE(ck::utils::check_err(
        out_spatial_len,
-        std::vector<ck::index_t>{23, 23, 23},
+        std::vector<ck::long_index_t>{23, 23, 23},
        "Error: ConvParams 3D strides{3, 3, 3}, padding {1, 1, 1}, dilations {2, 2, 2}."));
 }
--- a/test/gemm_universal/test_gemm_universal_util.hpp
+++ b/test/gemm_universal/test_gemm_universal_util.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -25,12 +25,13 @@ class TestGemmUniversal : public testing::Test
    using F32 = float;

    protected:
-    using ALayout   = std::tuple_element_t<0, Tuple>;
-    using BLayout   = std::tuple_element_t<1, Tuple>;
-    using CLayout   = Row;
-    using ADataType = std::tuple_element_t<2, Tuple>;
-    using BDataType = std::tuple_element_t<3, Tuple>;
-    using CDataType = std::tuple_element_t<4, Tuple>;
+    using ALayout         = std::tuple_element_t<0, Tuple>;
+    using BLayout         = std::tuple_element_t<1, Tuple>;
+    using CLayout         = Row;
+    using ADataType       = std::tuple_element_t<2, Tuple>;
+    using BDataType       = std::tuple_element_t<3, Tuple>;
+    using ComputeDataType = std::tuple_element_t<4, Tuple>;
+    using CDataType       = std::tuple_element_t<5, Tuple>;

    public:
    static constexpr bool verify_     = true;
@@ -66,6 +67,7 @@ class TestGemmUniversal : public testing::Test
    {
        bool pass = ck::profiler::profile_gemm_universal_impl<ADataType,
                                                              BDataType,
+                                                              ComputeDataType,
                                                              F32,
                                                              CDataType,
                                                              ALayout,

--- a/test/gemm_universal/test_gemm_universal_xdl.cpp
+++ b/test/gemm_universal/test_gemm_universal_xdl.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include <tuple>

@@ -41,16 +41,24 @@ class TestGemmUniversal_MK_NK
 };

 // clang-format off
-using KernelTypes = ::testing::Types<
-    //         ADataType, BDataType, CDataType
-    std::tuple<      F16,       F16,       F16>,
-    std::tuple<      F16,        F8,       F16>,
-    std::tuple<      F8,        F16,       F16>,
-    std::tuple<     BF16,      BF16,      BF16>
+using KernelTypes_MK_KN = ::testing::Types<
+    //         ADataType, BDataType, ComputeDataType, CDataType
+    std::tuple<      F16,       F16,             F16,     F16>,
+    std::tuple<      F16,        F8,             F16,     F16>,
+    std::tuple<       F8,       F16,             F16,     F16>,
+    std::tuple<     BF16,      BF16,            BF16,    BF16>
+    >;
+using KernelTypes_MK_NK = ::testing::Types<
+    //         ADataType, BDataType, ComputeDataType, CDataType
+    std::tuple<      F16,       F16,             F16,     F16>,
+    std::tuple<      F16,        F8,             F16,     F16>,
+    std::tuple<       F8,       F16,             F16,     F16>,
+    std::tuple<     BF16,      BF16,            BF16,    BF16>,
+    std::tuple<       F8,        F8,              F8,    BF16>
    >;
 // clang-format on

-TYPED_TEST_SUITE(TestGemmUniversal_MK_KN, KernelTypes);
-TYPED_TEST_SUITE(TestGemmUniversal_MK_NK, KernelTypes);
+TYPED_TEST_SUITE(TestGemmUniversal_MK_KN, KernelTypes_MK_KN);
+TYPED_TEST_SUITE(TestGemmUniversal_MK_NK, KernelTypes_MK_NK);

 #include "test_gemm_universal_ut_cases.inc"
--- a/test/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp
+++ b/test/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp
@@ -17,6 +17,7 @@ class TestGroupedConvndFwd : public ::testing::Test
    using InLayout  = std::tuple_element_t<1, Tuple>;
    using WeiLayout = std::tuple_element_t<2, Tuple>;
    using OutLayout = std::tuple_element_t<3, Tuple>;
+    using IndexType = std::tuple_element_t<4, Tuple>;

    std::vector<ck::utils::conv::ConvParam> conv_params;

@@ -33,7 +34,10 @@ class TestGroupedConvndFwd : public ::testing::Test
                                                                       OutLayout,
                                                                       DataType,
                                                                       DataType,
-                                                                       DataType>(
+                                                                       DataType,
+                                                                       DataType,
+                                                                       DataType,
+                                                                       IndexType>(
                               true,  // do_verification
                               1,     // init_method: integer value
                               false, // do_log
@@ -46,30 +50,31 @@ class TestGroupedConvndFwd : public ::testing::Test

 using namespace ck::tensor_layout::convolution;

-using KernelTypes1d = ::testing::Types<std::tuple<float, GNWC, GKXC, GNWK>,
-                                       std::tuple<ck::half_t, GNWC, GKXC, GNWK>,
-                                       std::tuple<ck::bhalf_t, GNWC, GKXC, GNWK>,
-                                       std::tuple<int8_t, GNWC, GKXC, GNWK>>;
-
-using KernelTypes2d = ::testing::Types<std::tuple<float, GNHWC, GKYXC, GNHWK>,
-                                       std::tuple<ck::half_t, GNHWC, GKYXC, GNHWK>,
-                                       std::tuple<ck::bhalf_t, GNHWC, GKYXC, GNHWK>,
-                                       std::tuple<int8_t, GNHWC, GKYXC, GNHWK>,
-                                       std::tuple<float, NHWGC, GKYXC, NHWGK>,
-                                       std::tuple<ck::half_t, NHWGC, GKYXC, NHWGK>,
-                                       std::tuple<ck::bhalf_t, NHWGC, GKYXC, NHWGK>,
-                                       std::tuple<int8_t, NHWGC, GKYXC, NHWGK>>;
-
-using KernelTypes3d = ::testing::Types<std::tuple<float, GNDHWC, GKZYXC, GNDHWK>,
-                                       std::tuple<ck::half_t, GNDHWC, GKZYXC, GNDHWK>,
-                                       std::tuple<ck::bhalf_t, GNDHWC, GKZYXC, GNDHWK>,
-                                       std::tuple<int8_t, GNDHWC, GKZYXC, GNDHWK>,
-                                       std::tuple<float, NDHWGC, GKZYXC, NDHWGK>,
-                                       std::tuple<ck::half_t, NDHWGC, GKZYXC, NDHWGK>,
-                                       std::tuple<ck::bhalf_t, NDHWGC, GKZYXC, NDHWGK>,
-                                       std::tuple<int8_t, NDHWGC, GKZYXC, NDHWGK>>;
-
-using KernelTypes2dLargeCases = ::testing::Types<std::tuple<float, NHWGC, GKYXC, NHWGK>>;
+using KernelTypes1d = ::testing::Types<std::tuple<float, GNWC, GKXC, GNWK, ck::index_t>,
+                                       std::tuple<ck::half_t, GNWC, GKXC, GNWK, ck::index_t>,
+                                       std::tuple<ck::bhalf_t, GNWC, GKXC, GNWK, ck::index_t>,
+                                       std::tuple<int8_t, GNWC, GKXC, GNWK, ck::index_t>>;
+
+using KernelTypes2d = ::testing::Types<std::tuple<float, GNHWC, GKYXC, GNHWK, ck::index_t>,
+                                       std::tuple<ck::half_t, GNHWC, GKYXC, GNHWK, ck::index_t>,
+                                       std::tuple<ck::bhalf_t, GNHWC, GKYXC, GNHWK, ck::index_t>,
+                                       std::tuple<int8_t, GNHWC, GKYXC, GNHWK, ck::index_t>,
+                                       std::tuple<float, NHWGC, GKYXC, NHWGK, ck::index_t>,
+                                       std::tuple<ck::half_t, NHWGC, GKYXC, NHWGK, ck::index_t>,
+                                       std::tuple<ck::bhalf_t, NHWGC, GKYXC, NHWGK, ck::index_t>,
+                                       std::tuple<int8_t, NHWGC, GKYXC, NHWGK, ck::index_t>>;
+
+using KernelTypes3d = ::testing::Types<std::tuple<float, GNDHWC, GKZYXC, GNDHWK, ck::index_t>,
+                                       std::tuple<ck::half_t, GNDHWC, GKZYXC, GNDHWK, ck::index_t>,
+                                       std::tuple<ck::bhalf_t, GNDHWC, GKZYXC, GNDHWK, ck::index_t>,
+                                       std::tuple<int8_t, GNDHWC, GKZYXC, GNDHWK, ck::index_t>,
+                                       std::tuple<float, NDHWGC, GKZYXC, NDHWGK, ck::index_t>,
+                                       std::tuple<ck::half_t, NDHWGC, GKZYXC, NDHWGK, ck::index_t>,
+                                       std::tuple<ck::bhalf_t, NDHWGC, GKZYXC, NDHWGK, ck::index_t>,
+                                       std::tuple<int8_t, NDHWGC, GKZYXC, NDHWGK, ck::index_t>>;
+
+using KernelTypes2dLargeCases =
+    ::testing::Types<std::tuple<float, NHWGC, GKYXC, NHWGK, ck::long_index_t>>;

 template <typename Tuple>
 class TestGroupedConvndFwd1d : public TestGroupedConvndFwd<Tuple>
@@ -153,5 +158,8 @@ TYPED_TEST(TestGroupedConvndFwd2dLargeCases, Test2DLargeCases)
    // With supported NumGroupsToMerge > 1
    this->conv_params.push_back(
        {2, 32, 64, 1, 1, {2, 2}, {672, 672}, {672, 672}, {1, 1}, {0, 0}, {0, 0}});
+    // When image is larger than 2GB
+    this->conv_params.push_back(
+        {2, 1, 1, 256, 256, {3, 3}, {4096, 2048}, {1024, 1024}, {3, 3}, {1, 1}, {1, 1}});
    this->template Run<2>();
 }