Unverified Commit 8b49f207 authored by Max Podkorytov, committed by GitHub

Merge branch 'develop' into fa-h512

parents 0d59f474 a6b761c3
@@ -58,6 +58,7 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
list(APPEND PROFILER_SOURCES profile_gemm_bias_add_reduce.cpp)
list(APPEND PROFILER_SOURCES profile_gemm_splitk.cpp)
list(APPEND PROFILER_SOURCES profile_gemm_universal.cpp)
list(APPEND PROFILER_SOURCES profile_gemm_b_scale.cpp)
list(APPEND PROFILER_SOURCES profile_gemm_universal_batched.cpp)
list(APPEND PROFILER_SOURCES profile_gemm_universal_reduce.cpp)
list(APPEND PROFILER_SOURCES profile_gemm_universal_streamk.cpp)
@@ -141,6 +142,7 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
endif()
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_splitk_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_b_scale_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_batched_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_reduce_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_streamk_instance)
@@ -177,5 +179,4 @@ if(DL_KERNELS)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_weight_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_weight_instance)
endif()
rocm_install(TARGETS ${PROFILER_EXECUTABLE} COMPONENT profiler)
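These hunks only wire the new B-scale GEMM profiler into the build: the source file is appended to PROFILER_SOURCES and the matching instance library is linked into the profiler executable. A minimal sketch of picking the change up by rebuilding just the profiler, assuming the target keeps the ckProfiler name that the performance scripts further down reference:

```bash
# rebuild only the profiler after reconfiguring; the "ckProfiler" target/binary name
# is taken from the scripts below, not from this hunk
cmake --build build --target ckProfiler -j"$(nproc)"
ls build/bin/ckProfiler
```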
// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <initializer_list>
#include <iostream>
#include <numeric>
#include "profiler/profile_gemm_b_scale_impl.hpp"
#include "profiler_operation_registry.hpp"
enum struct GemmMatrixLayout
{
MK_KN_MN, // 0
MK_NK_MN, // 1
KM_KN_MN, // 2
KM_NK_MN, // 3
};
enum struct GemmDataType
{
F32_F32_F32, // 0
F16_F16_F16, // 1
BF16_BF16_BF16, // 2
INT8_INT8_INT8, // 3
F8_F16_F16, // 4
F16_F8_F16, // 5
F16_F16_F16_F8, // 6
F8_F8_BF16, // 7
F16_I4_F16, // 8
};
enum struct BScaleBlockTile
{
K_64, // 0
K_128, // 1
};
#define OP_NAME "gemm_b_scale"
#define OP_DESC "Int4-dequant GEMM"
int profile_gemm_b_scale(int argc, char* argv[])
{
if(argc != 16 && argc != 19)
{
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: f8@f16; 5: f16@f8; 6: "
"f16->f8; 7: f8->bf16, "
"comp f8; 8: f16@i4)\n");
printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
printf(" 1: A[m, k] * B[n, k] = C[m, n];\n");
printf(" 2: A[k, m] * B[k, n] = C[m, n];\n");
printf(" 3: A[k, m] * B[n, k] = C[m, n])\n");
printf("arg4: B scale block tile (0: 64, 1: 128):\n");
printf("arg5: verification (0: no; 1: yes)\n");
printf("arg6: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg7: print tensor value (0: no; 1: yes)\n");
printf("arg8: time kernel (0=no, 1=yes)\n");
printf("arg9 to 14: M, N, K, StrideA, StrideB, StrideC\n");
printf("arg15: split k into mulitiple batch\n");
printf("optional:\n");
printf("arg16: number of warm-up cycles (default 1)\n");
printf("arg17: number of iterations (default 10)\n");
printf("arg18: memory for rotating buffer (default 0, size in MB)\n");
exit(1);
}
printf("Start profiling\n");
const auto data_type = static_cast<GemmDataType>(std::stoi(argv[2]));
const auto layout = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
const auto B_scale_block = static_cast<BScaleBlockTile>(std::stoi(argv[4]));
const bool do_verification = std::stoi(argv[5]);
const int init_method = std::stoi(argv[6]);
const bool do_log = std::stoi(argv[7]);
const bool time_kernel = std::stoi(argv[8]);
const int M = std::stoi(argv[9]);
const int N = std::stoi(argv[10]);
const int K = std::stoi(argv[11]);
const int StrideA = std::stoi(argv[12]);
const int StrideB = std::stoi(argv[13]);
const int StrideC = std::stoi(argv[14]);
const int KBatch = std::stoi(argv[15]);
printf("M:%d, N:%d, K:%d, StrideA:%d, StrideB:%d, StrideC:%d, KBatch:%d\n",
M,
N,
K,
StrideA,
StrideB,
StrideC,
KBatch);
int n_warmup = 1;
int n_iter = 10;
uint64_t rotating = 0;
if(argc == 19)
{
n_warmup = std::stoi(argv[16]);
n_iter = std::stoi(argv[17]);
rotating = std::stoull(argv[18]) * 1024 * 1024;
printf("n_warmup:%d, n_iter:%d, rotating:%lu\n", n_warmup, n_iter, rotating);
}
using F32 = float;
using F16 = ck::half_t;
using I4 = ck::pk_i4_t;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
auto profile = [&](auto a_type,
auto b_type,
auto b_scale_type,
auto comp_type,
auto acc_type,
auto c_type,
auto scale_block_k,
auto a_layout,
auto b_layout,
auto c_layout) {
using ADataType = decltype(a_type);
using BDataType = decltype(b_type);
using BScaleDataType = decltype(b_scale_type);
using ComputeDataType = decltype(comp_type);
using AccDataType = decltype(acc_type);
using CDataType = decltype(c_type);
using ALayout = decltype(a_layout);
using BLayout = decltype(b_layout);
using CLayout = decltype(c_layout);
const int DefaultStrideA = ck::is_same_v<ALayout, Row> ? K : M;
const int DefaultStrideB = ck::is_same_v<BLayout, Row> ? N : K;
const int DefaultStrideC = ck::is_same_v<CLayout, Row> ? N : M;
bool pass = ck::profiler::profile_gemm_b_scale_impl<ADataType,
BDataType,
BScaleDataType,
ComputeDataType,
AccDataType,
CDataType,
scale_block_k,
ALayout,
BLayout,
CLayout>(
do_verification,
init_method,
do_log,
time_kernel,
M,
N,
K,
(StrideA < 0) ? DefaultStrideA : StrideA,
(StrideB < 0) ? DefaultStrideB : StrideB,
(StrideC < 0) ? DefaultStrideC : StrideC,
KBatch,
n_warmup,
n_iter,
rotating);
return pass ? 0 : 1;
};
if(data_type == GemmDataType::F16_I4_F16 && layout == GemmMatrixLayout::MK_NK_MN &&
B_scale_block == BScaleBlockTile::K_128)
{
printf("F16_I4_F16 MK_NK_MN K_128\n");
return profile(
F16{}, I4{}, F16{}, F16{}, F32{}, F16{}, ck::Number<128>{}, Row{}, Col{}, Row{});
}
else
{
std::cout << "this data_type & layout is not implemented" << std::endl;
return 1;
}
}
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_b_scale);
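For reference, a minimal invocation matching the help text above; the binary path and problem sizes are placeholders, and per the dispatch at the end of the file only data type 8 (f16@i4), layout 1 (MK_NK_MN) and B-scale block 1 (K_128) are implemented so far:

```bash
# gemm_b_scale: dtype=8 layout=1 bscale_block=1 verify=1 init=1 log=0 time=1,
# then M N K StrideA StrideB StrideC KBatch; negative strides fall back to the
# packed defaults derived from M/N/K inside the profiler
../build/bin/ckProfiler gemm_b_scale 8 1 1 1 1 0 1 1024 4096 4096 -1 -1 -1 1
```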
// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <initializer_list>
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "profiler/profile_gemm_universal_impl.hpp"
#include "profiler_operation_registry.hpp"
@@ -27,6 +27,8 @@ enum struct GemmDataType
F16_F8_F16, // 5
F16_F16_F16_F8, // 6
F8_F8_BF16, // 7
F16_I4_F16, // 8
BF16_I4_BF16, // 9
};
#define OP_NAME "gemm_universal"
@@ -39,7 +41,7 @@ int profile_gemm_universal(int argc, char* argv[])
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: f8@f16; 5: f16@f8; 6: "
"f16->f8; 7: f8->bf16, "
"comp f8)\n");
"comp f8; 8: f16@i4; 9: bf16@i4\n");
printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
printf(" 1: A[m, k] * B[n, k] = C[m, n];\n");
printf(" 2: A[k, m] * B[k, n] = C[m, n];\n");
@@ -103,6 +105,7 @@ int profile_gemm_universal(int argc, char* argv[])
using BF16 = ck::bhalf_t;
#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)
using F8 = ck::f8_t;
using I4 = ck::pk_i4_t;
#endif
using Row = ck::tensor_layout::gemm::RowMajor;
@@ -207,6 +210,14 @@ int profile_gemm_universal(int argc, char* argv[])
{
return profile(F8{}, F8{}, F8{}, F32{}, BF16{}, Row{}, Col{}, Row{});
}
else if(data_type == GemmDataType::F16_I4_F16 && layout == GemmMatrixLayout::MK_NK_MN)
{
return profile(F16{}, I4{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{});
}
else if(data_type == GemmDataType::BF16_I4_BF16 && layout == GemmMatrixLayout::MK_NK_MN)
{
return profile(BF16{}, I4{}, BF16{}, F32{}, BF16{}, Row{}, Col{}, Row{});
}
#endif
else
{
...
@@ -31,7 +31,7 @@ enum struct GemmDataType
int profile_batched_gemm_universal(int argc, char* argv[])
{
if(argc != 18 && argc != 21)
if(argc != 19 && argc != 22)
{
// clang-format off
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
@@ -44,11 +44,11 @@ int profile_batched_gemm_universal(int argc, char* argv[])
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg6: print tensor value (0: no; 1: yes)\n");
printf("arg7: time kernel (0=no, 1=yes)\n");
printf("arg8 to 17: M, N, K, StrideA, StrideB, StrideC, BatchStrideA, BatchStrideB, BatchStrideC, BatchCount\n");
printf("arg8 to 18: M, N, K, StrideA, StrideB, StrideC, BatchStrideA, BatchStrideB, BatchStrideC, BatchCount, KBatch\n");
printf("optional:\n");
printf("arg18: number of warm-up cycles (default 1)\n");
printf("arg19: number of warm-up cycles (default 1)\n");
printf("arg19: number of iterations (default 10)\n");
printf("arg20: number of iterations (default 10)\n");
printf("arg20: memory for rotating buffer (default 0, size in MB)\n");
printf("arg21: memory for rotating buffer (default 0, size in MB)\n");
// clang-format on
exit(1);
}
@@ -56,11 +56,11 @@ int profile_batched_gemm_universal(int argc, char* argv[])
int n_warmup = 1;
int n_iter = 10;
uint64_t rotating = 0;
if(argc == 21)
if(argc == 22)
{
n_warmup = std::stoi(argv[18]);
n_warmup = std::stoi(argv[19]);
n_iter = std::stoi(argv[19]);
n_iter = std::stoi(argv[20]);
rotating = std::stoull(argv[20]) * 1024 * 1024;
rotating = std::stoull(argv[21]) * 1024 * 1024;
}
const auto data_type = static_cast<GemmDataType>(std::stoi(argv[2]));
@@ -83,6 +83,7 @@ int profile_batched_gemm_universal(int argc, char* argv[])
const int BatchStrideC = std::stoi(argv[16]);
const int BatchCount = std::stoi(argv[17]);
const int KBatch = std::stoi(argv[18]);
#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)
using F8 = ck::f8_t;
@@ -159,6 +160,7 @@ int profile_batched_gemm_universal(int argc, char* argv[])
StrideB_,
StrideC_,
BatchCount,
KBatch,
n_warmup,
n_iter,
rotating);
...
@@ -83,8 +83,9 @@ int profile_gemm_universal_streamk(int argc, char* argv[])
rotating = std::stoull(argv[18]) * 1024 * 1024;
}
using F32 = float;
using F16 = ck::half_t;
using BF16 = ck::bhalf_t;
#if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)
using F8 = ck::f8_t;
@@ -165,6 +166,22 @@ int profile_gemm_universal_streamk(int argc, char* argv[])
return profile(F8{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{});
}
#endif
else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_KN_MN)
{
return profile(BF16{}, BF16{}, F32{}, BF16{}, Row{}, Row{}, Row{});
}
else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_NK_MN)
{
return profile(BF16{}, BF16{}, F32{}, BF16{}, Row{}, Col{}, Row{});
}
else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::KM_KN_MN)
{
return profile(BF16{}, BF16{}, F32{}, BF16{}, Col{}, Row{}, Row{});
}
else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::KM_NK_MN)
{
return profile(BF16{}, BF16{}, F32{}, BF16{}, Col{}, Col{}, Row{});
}
else
{
std::cout << "this data_type & layout is not implemented" << std::endl;
...
@@ -15,9 +15,9 @@ else
fi
cmake \
-D CMAKE_PREFIX_PATH=/opt/rocm \
-D CMAKE_PREFIX_PATH=/opt/rocm/ \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_CXX_FLAGS="-Xclang -mllvm -Xclang -enable-post-misched=0 -std=c++17 -O3 -ftemplate-backtrace-limit=0 -fPIE -Wno-gnu-line-marker" \
-D CMAKE_BUILD_TYPE=Release \
-D BUILD_DEV=ON \
-D GPU_TARGETS=$GPU_TARGETS \
...
@@ -82,7 +82,7 @@ def parse_logfile(logfile):
StrideA=[]
StrideB=[]
StrideC=[]
if 'perf_gemm.log' in logfile:
if 'perf_gemm' in logfile and 'gemm_bilinear' not in logfile:
for line in open(logfile):
if 'Best Perf' in line:
lst=line.split()
@@ -260,7 +260,7 @@ def main():
conn = sqlEngine.connect()
#save gemm performance tests:
if 'perf_gemm.log' in filename:
if 'perf_gemm' in filename and 'gemm_bilinear' not in filename:
#write the ck_gemm_test_params table only needed once the test set changes
#post_test_params(test_list,conn)
for i in range(1,len(results)+1):
@@ -332,7 +332,7 @@ def main():
table_name="ck_fmha_bwd_tflops"
tflops_base = get_baseline(table_name,conn)
store_new_test_result(table_name, results, testlist, branch_name, node_id, gpu_arch, compute_units, rocm_vers, hip_vers, environment, conn)
store_new_test_result(table_name, results, testlist, branch_name, node_id, gpu_arch, compute_units, rocm_vers, hip_vers, environment, sqlEngine)
conn.close()
#compare the results to the baseline if baseline exists
...
@@ -11,9 +11,22 @@
#process results
python3 process_perf_data.py perf_gemm.log
python3 process_perf_data.py perf_onnx_gemm.log
python3 process_perf_data.py perf_resnet50_N256.log
python3 process_perf_data.py perf_resnet50_N4.log
file=./perf_onnx_gemm_gfx10.log
if [ -e "$file" ]; then
python3 process_perf_data.py perf_onnx_gemm_gfx10.log
fi
file=./perf_onnx_gemm_gfx11.log
if [ -e "$file" ]; then
python3 process_perf_data.py perf_onnx_gemm_gfx11.log
fi
file=./perf_onnx_gemm_gfx12.log
if [ -e "$file" ]; then
python3 process_perf_data.py perf_onnx_gemm_gfx12.log
fi
file=./perf_fmha_fwd_gfx942.log
if [ -e "$file" ]; then
python3 process_perf_data.py perf_fmha_fwd_gfx942.log
...
@@ -24,6 +24,18 @@ python3 process_perf_data.py perf_splitK_gemm.log
python3 process_perf_data.py perf_onnx_gemm.log
python3 process_perf_data.py perf_mixed_gemm.log
file=./perf_onnx_gemm_gfx10.log
if [ -e "$file" ]; then
python3 process_perf_data.py perf_onnx_gemm_gfx10.log
fi
file=./perf_onnx_gemm_gfx11.log
if [ -e "$file" ]; then
python3 process_perf_data.py perf_onnx_gemm_gfx11.log
fi
file=./perf_onnx_gemm_gfx12.log
if [ -e "$file" ]; then
python3 process_perf_data.py perf_onnx_gemm_gfx12.log
fi
file=./perf_fmha_fwd_gfx942.log
if [ -e "$file" ]; then
python3 process_perf_data.py perf_fmha_fwd_gfx942.log
...
@@ -5,7 +5,7 @@
# post your new test results to the database and compare them to the baseline
# please contact Illia.Silin@amd.com for more details
#
# run the script as "./run_full_performance_tests.sh <verification> <tag for your test environment> <branch name> < node name>
# run the script as "./run_full_performance_tests.sh <verification> <tag for your test environment> <branch name> <node name>
# input arguments:
# verification = 0 : do not verify result correctness on CPU
# = 1 : verify correctness on CPU (may take a long time)
...
#!/bin/bash
#
# in order to run this script you'd first need to build the ckProfiler executable in ../build/bin/
# run the script as "./run_gemm_performance_tests.sh <verification> <tag for your test environment> <branch name> <node name> <arch>
# input arguments:
# verification = 0 : do not verify result correctness on CPU
# = 1 : verify correctness on CPU (may take a long time)
# environment tag : a string describing the specifics of your test environment
# branch name : name of the branch in git repo (git status | grep -e 'On branch')
# node name : $hostname
# arch : GPU architecture, e.g. "gfx9" or "gfx1100"
#get the command line arguments:
export verify=$1
echo 'Verification: ' $verify
export env_type=$2
echo 'Environment type: ' $env_type
export branch=$3
echo 'Branch name: ' $branch
export host_name=$4
echo 'Host name: ' $host_name
export arch=$5
echo 'GPU architecture: ' $arch
function print_log_header(){
rm -f $1;
echo 'On branch ' $3 &> $1;
echo 'Node name: ' $4 >> $1;
#get GPU_arch and number of compute units from rocminfo
echo -n "GPU_arch: " >> $1; rocminfo | grep "Name:" | grep "gfx" >> $1;
rocminfo | grep "Compute Unit:" >> $1;
hipcc --version | grep -e 'HIP version' >> $1;
echo 'Environment type: ' $2 >> $1;
/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> $1;
}
#run ONNX gemm tests
export onnx_log="perf_onnx_gemm_$arch.log"
print_log_header $onnx_log $env_type $branch $host_name
./profile_onnx_gemm.sh gemm 0 0 $verify 1 0 1 2>&1 | tee -a $onnx_log
./profile_onnx_gemm.sh gemm 1 0 $verify 1 0 1 2>&1 | tee -a $onnx_log
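A usage sketch for this new script, following its own header comments (the environment tag, branch and node below are placeholders); it appends its results to perf_onnx_gemm_<arch>.log, which the process_perf_data.py calls added above then look for:

```bash
# verification=0, environment tag, branch, node name, GPU arch
./run_gemm_performance_tests.sh 0 "rocm-docker" develop "$(hostname)" gfx11
# -> produces perf_onnx_gemm_gfx11.log for process_perf_data.py
```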
#!/bin/bash
#
# in order to run this script you'd first need to build the ckProfiler executable in ../build/bin/
# run the script as "./run_performance_tests.sh <verification> <tag for your test environment> <branch name> < node name>
# run the script as "./run_performance_tests.sh <verification> <tag for your test environment> <branch name> <node name>
# input arguments:
# verification = 0 : do not verify result correctness on CPU
# = 1 : verify correctness on CPU (may take a long time)
@@ -51,20 +51,11 @@ print_log_header $gemm_log $env_type $branch $host_name
./profile_gemm.sh gemm 2 3 $verify 1 0 1 | tee -a $gemm_log
./profile_gemm.sh gemm 3 3 $verify 1 0 1 | tee -a $gemm_log
#run grouped_fwd fp16 tests
export grouped_conv_fwd_log="perf_grouped_conv_fwd_fp16.log"
print_log_header $conv_fwd_log $env_type $branch $host_name
./profile_grouped_conv_fwd.sh grouped_conv_fwd 1 1 0 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_fwd_log
#run ONNX gemm tests
export onnx_log="perf_onnx_gemm.log"
print_log_header $onnx_log $env_type $branch $host_name
./profile_onnx_gemm.sh gemm 0 0 $verify 1 0 1 2>&1 | tee -a $onnx_log
./profile_onnx_gemm.sh gemm 1 0 $verify 1 0 1 2>&1 | tee -a $onnx_log
#run grouped_bwd_data fp16 tests
export grouped_conv_bwd_data_log="perf_grouped_conv_bwd_data_fp16.log"
print_log_header $grouped_conv_bwd_data_log $env_type $branch $host_name
./profile_grouped_conv_bwd_data.sh grouped_conv_bwd_data 1 1 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_bwd_data_log
#run grouped_bwd_weight fp16 tests
export grouped_conv_bwd_weight_log="perf_grouped_conv_bwd_weight_fp16.log"
print_log_header $grouped_conv_bwd_weight_log $env_type $branch $host_name
./profile_grouped_conv_bwd_weight.sh grouped_conv_bwd_weight 1 1 $verify 1 0 1 256 1 2>&1 | tee -a $grouped_conv_bwd_weight_log
#run resnet50 tests
export resnet256_log="perf_resnet50_N256.log"
...
@@ -24,12 +24,9 @@ class TestCkTileBatchedGemm : public ::testing::Test
using AccDataType = std::tuple_element_t<5, Tuple>;
using CDataType = std::tuple_element_t<6, Tuple>;
struct batched_gemm_kargs : public ck_tile::BatchedGemmHostArgs
{
};
template <typename ALayout, typename BLayout, typename CLayout>
void invoke_batched_gemm(const batched_gemm_kargs& args, const ck_tile::stream_config& s)
void invoke_batched_gemm(const ck_tile::BatchedGemmHostArgs& args,
const ck_tile::stream_config& s)
{
// The kPadM, kPadN, kPadK & kBlockPerCu should also come from the Codegen part.
constexpr bool kPadM = false;
@@ -94,9 +91,9 @@ class TestCkTileBatchedGemm : public ::testing::Test
using Kernel =
ck_tile::BatchedGemmKernel<TilePartitioner, CodegenGemmPipeline, GemmEpilogue>;
auto kargs = Kernel::MakeKargs(args);
auto kargs = Kernel::MakeKernelArgs(args);
const dim3 grids = Kernel::GridSize(args);
const dim3 grids = Kernel::GridSize(args.M, args.N, args.k_batch, args.batch_count);
constexpr dim3 blocks = Kernel::BlockSize();
if(s.log_level_ > 0)
@@ -185,21 +182,23 @@ class TestCkTileBatchedGemm : public ::testing::Test
c_m_n_dev_buf.SetZero();
c_m_n_dev_result.SetZero();
batched_gemm_kargs kargs{a_m_k_dev_buf.GetDeviceBuffer(),
b_k_n_dev_buf.GetDeviceBuffer(),
c_m_n_dev_buf.GetDeviceBuffer(),
M,
N,
K,
StrideA,
StrideB,
StrideC,
BatchStrideA,
BatchStrideB,
BatchStrideC,
BatchCount};
invoke_batched_gemm<ALayout, BLayout, CLayout>(kargs,
ck_tile::stream_config{nullptr, false});
ck_tile::BatchedGemmHostArgs args;
args.a_ptr = a_m_k_dev_buf.GetDeviceBuffer();
args.b_ptr = b_k_n_dev_buf.GetDeviceBuffer();
args.c_ptr = c_m_n_dev_buf.GetDeviceBuffer();
args.k_batch = 1;
args.M = M;
args.N = N;
args.K = K;
args.stride_A = StrideA;
args.stride_B = StrideB;
args.stride_C = StrideC;
args.batch_stride_A = BatchStrideA;
args.batch_stride_B = BatchStrideB;
args.batch_stride_C = BatchStrideC;
args.batch_count = BatchCount;
invoke_batched_gemm<ALayout, BLayout, CLayout>(args,
ck_tile::stream_config{nullptr, false});
std::cout << "Run kernel with M =" << M << " N =" << N << " K =" << K
...
# Currently ck_tile is only built on gfx9
if(GPU_TARGETS MATCHES "gfx9")
add_gtest_executable(test_ck_tile_gemm_mem_pipeline test_gemm_mem_pipeline.cpp)
add_gtest_executable(test_ck_tile_gemm_pipeline test_gemm_pipeline.cpp)
endif()
@@ -6,7 +6,7 @@
#include "gtest/gtest.h"
#include "ck_tile/host.hpp"
#include "test_gemm_mem_pipeline_util.hpp"
#include "test_gemm_pipeline_util.hpp"
using F16 = ck_tile::half_t;
using F32 = float;
@@ -16,21 +16,27 @@ using Intrawave = ck_tile::integral_constant<ck_tile::GemmPipelineScheduler,
ck_tile::GemmPipelineScheduler::Intrawave>;
using Interwave = ck_tile::integral_constant<ck_tile::GemmPipelineScheduler,
ck_tile::GemmPipelineScheduler::Interwave>;
using Mem = ck_tile::integral_constant<GemmPipelineType, GemmPipelineType::Mem>;
using Comp = ck_tile::integral_constant<GemmPipelineType, GemmPipelineType::Comp>;
// clang-format off
using KernelTypes = ::testing::Types<
// ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType, CDataType, GemmPipelineScheduler
// ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType, CDataType, GemmPipelineScheduler, PipelineType
std::tuple< Row, Row, Row, F16, F16, F32, F16, Intrawave>,
std::tuple< Row, Row, Row, F16, F16, F32, F16, Interwave>,
std::tuple< Row, Col, Row, F16, F16, F32, F16, Intrawave>,
std::tuple< Row, Col, Row, F16, F16, F32, F16, Interwave>,
std::tuple< Col, Row, Row, F16, F16, F32, F16, Intrawave>,
std::tuple< Col, Row, Row, F16, F16, F32, F16, Interwave>,
std::tuple< Col, Col, Row, F16, F16, F32, F16, Intrawave>,
std::tuple< Col, Col, Row, F16, F16, F32, F16, Interwave>
std::tuple< Row, Row, Row, F16, F16, F32, F16, Intrawave, Mem>,
std::tuple< Row, Row, Row, F16, F16, F32, F16, Intrawave, Comp>,
std::tuple< Row, Row, Row, F16, F16, F32, F16, Interwave, Mem>,
std::tuple< Row, Col, Row, F16, F16, F32, F16, Intrawave, Mem>,
std::tuple< Row, Col, Row, F16, F16, F32, F16, Intrawave, Comp>,
std::tuple< Row, Col, Row, F16, F16, F32, F16, Interwave, Mem>,
std::tuple< Col, Row, Row, F16, F16, F32, F16, Intrawave, Mem>,
std::tuple< Col, Row, Row, F16, F16, F32, F16, Intrawave, Comp>,
std::tuple< Col, Row, Row, F16, F16, F32, F16, Interwave, Mem>,
std::tuple< Col, Col, Row, F16, F16, F32, F16, Intrawave, Mem>,
std::tuple< Col, Col, Row, F16, F16, F32, F16, Intrawave, Comp>,
std::tuple< Col, Col, Row, F16, F16, F32, F16, Interwave, Mem>
>;
// clang-format on
TYPED_TEST_SUITE(TestCkTileGemmMemPipeline, KernelTypes);
TYPED_TEST_SUITE(TestCkTileGemmPipeline, KernelTypes);
#include "test_gemm_mem_pipeline_ut_cases.inc"
#include "test_gemm_pipeline_ut_cases.inc"
@@ -3,7 +3,7 @@
#pragma once
TYPED_TEST(TestCkTileGemmMemPipeline, SmallM)
TYPED_TEST(TestCkTileGemmPipeline, SmallM)
{
std::vector<int> Ms{1, 2, 3, 4, 5, 6};
constexpr int N = 1024;
@@ -13,7 +13,7 @@ TYPED_TEST(TestCkTileGemmMemPipeline, SmallM)
this->Run(M, N, K);
}
TYPED_TEST(TestCkTileGemmMemPipeline, MidLargeM)
TYPED_TEST(TestCkTileGemmPipeline, MidLargeM)
{
std::vector<int> Ms{127, 255, 312, 799, 1573};
constexpr int N = 1024;
@@ -23,7 +23,7 @@ TYPED_TEST(TestCkTileGemmMemPipeline, MidLargeM)
this->Run(M, N, K);
}
TYPED_TEST(TestCkTileGemmMemPipeline, PaddK)
TYPED_TEST(TestCkTileGemmPipeline, PaddK)
{
std::vector<int> Ms{127};
constexpr int N = 1024;
@@ -33,7 +33,7 @@ TYPED_TEST(TestCkTileGemmMemPipeline, PaddK)
this->Run(M, N, K);
}
TYPED_TEST(TestCkTileGemmMemPipeline, Regular)
TYPED_TEST(TestCkTileGemmPipeline, Regular)
{
std::vector<int> Ms{512};
constexpr int N = 1024;
@@ -43,7 +43,7 @@ TYPED_TEST(TestCkTileGemmMemPipeline, Regular)
this->Run(M, N, K);
}
TYPED_TEST(TestCkTileGemmMemPipeline, NotSupportedArgument)
TYPED_TEST(TestCkTileGemmPipeline, NotSupportedArgument)
{
constexpr int M = 512;
constexpr int N = 1025;
...
@@ -11,36 +11,28 @@
#include "ck_tile/ops/epilogue.hpp"
#include "ck_tile/ops/gemm.hpp"
enum struct GemmPipelineType
{
Mem,
Comp
};
template <typename Tuple>
class TestCkTileGemmMemPipeline : public ::testing::Test
class TestCkTileGemmPipeline : public ::testing::Test
{
protected:
using ALayout = std::tuple_element_t<0, Tuple>;
using BLayout = std::tuple_element_t<1, Tuple>;
using CLayout = std::tuple_element_t<2, Tuple>;
using ADataType = std::tuple_element_t<3, Tuple>;
using BDataType = std::tuple_element_t<4, Tuple>;
using AccDataType = std::tuple_element_t<5, Tuple>;
using CDataType = std::tuple_element_t<6, Tuple>;
static constexpr auto Scheduler = std::tuple_element_t<7, Tuple>::value;
static constexpr auto PipelineType = std::tuple_element_t<8, Tuple>::value;
// TODO: expose tile size through test t-param ?
struct gemm_args
{
const void* p_a;
const void* p_b;
void* p_c;
ck_tile::index_t kbatch;
ck_tile::index_t M;
ck_tile::index_t N;
ck_tile::index_t K;
ck_tile::index_t stride_A;
ck_tile::index_t stride_B;
ck_tile::index_t stride_C;
};
template <bool PadM, bool PadN, bool PadK>
void invoke_gemm(const gemm_args& args, const ck_tile::stream_config& s)
void invoke_gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
{
// TODO: This should be parameterized in tests
constexpr ck_tile::index_t M_Tile = 128;
@@ -74,10 +66,17 @@ class TestCkTileGemmMemPipeline : public ::testing::Test
using Traits = ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>;
using BaseGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrMem<
ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>>;
const ck_tile::index_t num_loop = TilePartitioner::GetLoopNum(args.K);
using BaseGemmPipeline = std::conditional_t<
PipelineType == GemmPipelineType::Mem,
ck_tile::BaseGemmPipelineAgBgCrMem<
ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>>,
ck_tile::BaseGemmPipelineAgBgCrCompV3<
ck_tile::
GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>>>;
const ck_tile::index_t k_grain = args.k_batch * K_Tile;
const ck_tile::index_t K_split = (args.K + k_grain - 1) / k_grain * K_Tile;
const ck_tile::index_t num_loop = TilePartitioner::GetLoopNum(K_split);
const bool has_hot_loop = BaseGemmPipeline::BlockHasHotloop(num_loop);
const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
@@ -85,27 +84,30 @@ class TestCkTileGemmMemPipeline : public ::testing::Test
constexpr bool has_hot_loop_v = has_hot_loop_.value;
constexpr auto tail_number_v = tail_number_.value;
using GemmPipeline = ck_tile::GemmPipelineAgBgCrMem<
ck_tile::UniversalGemmPipelineProblem<ADataType,
BDataType,
AccDataType,
GemmShape,
Traits,
Scheduler,
has_hot_loop_v,
tail_number_v>>;
using GemmPipeline =
std::conditional_t<PipelineType == GemmPipelineType::Mem,
ck_tile::GemmPipelineAgBgCrMem<
ck_tile::UniversalGemmPipelineProblem<ADataType,
BDataType,
AccDataType,
GemmShape,
Traits,
Scheduler,
has_hot_loop_v,
tail_number_v>>,
ck_tile::GemmPipelineAgBgCrCompV3<
ck_tile::UniversalGemmPipelineProblem<ADataType,
BDataType,
AccDataType,
GemmShape,
Traits,
Scheduler,
has_hot_loop_v,
tail_number_v>>>;
using Kernel = ck_tile::GemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
auto kargs = Kernel::MakeKargs(args.p_a,
args.p_b,
args.p_c,
args.M,
args.N,
args.K,
args.stride_A,
args.stride_B,
args.stride_C);
const dim3 grids = Kernel::GridSize(args.M, args.N, args.kbatch);
auto kargs = Kernel::MakeKernelArgs(args);
const dim3 grids = Kernel::GridSize(args.M, args.N, args.k_batch);
constexpr dim3 blocks = Kernel::BlockSize();
if(!Kernel::IsSupportedArgument(kargs))
@@ -297,11 +299,11 @@ class TestCkTileGemmMemPipeline : public ::testing::Test
c_m_n_dev_buf.SetZero();
c_m_n_dev_result.SetZero();
gemm_args args;
args.p_a = a_m_k_dev_buf.GetDeviceBuffer();
args.p_b = b_k_n_dev_buf.GetDeviceBuffer();
args.p_c = c_m_n_dev_buf.GetDeviceBuffer();
args.kbatch = kbatch;
ck_tile::GemmHostArgs args;
args.a_ptr = a_m_k_dev_buf.GetDeviceBuffer();
args.b_ptr = b_k_n_dev_buf.GetDeviceBuffer();
args.c_ptr = c_m_n_dev_buf.GetDeviceBuffer();
args.k_batch = kbatch;
args.M = M;
args.N = N;
args.K = K;
...
@@ -51,8 +51,11 @@ TEST(Custom_bool, TestAsType)
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<custom_bool_t>()(Number<i>{}) = custom_bool_t{test_vec.at(i)};
});
// copy the vector
vector_type<custom_bool_t, size> left_vec{right_vec};
vector_type<custom_bool_t, size> left_vec;
// check copy assignment op
left_vec = right_vec;
// overwrite right_vec with 0s
right_vec = vector_type<custom_bool_t, size>{};
// check if values were copied correctly
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<custom_bool_t>()(Number<i>{}).data, test_vec.at(i));
@@ -129,8 +132,11 @@ TEST(Custom_int8, TestAsType)
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<custom_int8_t>()(Number<i>{}) = custom_int8_t{test_vec.at(i)};
});
// copy the vector
vector_type<custom_int8_t, size> left_vec{right_vec};
vector_type<custom_int8_t, size> left_vec;
// check copy assignment op
left_vec = right_vec;
// overwrite right_vec with 0s
right_vec = vector_type<custom_int8_t, size>{};
// check if values were copied correctly
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<custom_int8_t>()(Number<i>{}).data, test_vec.at(i));
@@ -207,8 +213,11 @@ TEST(Custom_uint8, TestAsType)
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<custom_uint8_t>()(Number<i>{}) = custom_uint8_t{test_vec.at(i)};
});
// copy the vector
vector_type<custom_uint8_t, size> left_vec{right_vec};
vector_type<custom_uint8_t, size> left_vec;
// check copy assignment op
left_vec = right_vec;
// overwrite right_vec with 0s
right_vec = vector_type<custom_uint8_t, size>{};
// check if values were copied correctly
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<custom_uint8_t>()(Number<i>{}).data, test_vec.at(i));
@@ -287,8 +296,11 @@ TEST(Custom_f8, TestAsType)
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<custom_f8_t>()(Number<i>{}) = custom_f8_t{test_vec.at(i)};
});
// copy the vector
vector_type<custom_f8_t, size> left_vec{right_vec};
vector_type<custom_f8_t, size> left_vec;
// check copy assignment op
left_vec = right_vec;
// overwrite right_vec with 0s
right_vec = vector_type<custom_f8_t, size>{};
// check if values were copied correctly
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<custom_f8_t>()(Number<i>{}).data, test_vec.at(i));
@@ -369,8 +381,11 @@ TEST(Custom_bf8, TestAsType)
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<custom_bf8_t>()(Number<i>{}) = custom_bf8_t{test_vec.at(i)};
});
// copy the vector
vector_type<custom_bf8_t, size> left_vec{right_vec};
vector_type<custom_bf8_t, size> left_vec;
// check copy assignment op
left_vec = right_vec;
// overwrite right_vec with 0s
right_vec = vector_type<custom_bf8_t, size>{};
// check if values were copied correctly
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<custom_bf8_t>()(Number<i>{}).data, test_vec.at(i));
@@ -450,8 +465,11 @@ TEST(Custom_half, TestAsType)
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<custom_half_t>()(Number<i>{}) = custom_half_t{test_vec.at(i)};
});
// copy the vector
vector_type<custom_half_t, size> left_vec{right_vec};
vector_type<custom_half_t, size> left_vec;
// check copy assignment op
left_vec = right_vec;
// overwrite right_vec with 0s
right_vec = vector_type<custom_half_t, size>{};
// check if values were copied correctly
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<custom_half_t>()(Number<i>{}).data, test_vec.at(i));
@@ -533,8 +551,11 @@ TEST(Custom_bhalf, TestAsType)
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<custom_bhalf_t>()(Number<i>{}) = custom_bhalf_t{test_vec.at(i)};
});
// copy the vector
vector_type<custom_bhalf_t, size> left_vec{right_vec};
vector_type<custom_bhalf_t, size> left_vec;
// check copy assignment op
left_vec = right_vec;
// overwrite right_vec with 0s
right_vec = vector_type<custom_bhalf_t, size>{};
// check if values were copied correctly
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<custom_bhalf_t>()(Number<i>{}).data, test_vec.at(i));
@@ -615,8 +636,11 @@ TEST(Custom_float, TestAsType)
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<custom_float_t>()(Number<i>{}) = custom_float_t{test_vec.at(i)};
});
// copy the vector
vector_type<custom_float_t, size> left_vec{right_vec};
vector_type<custom_float_t, size> left_vec;
// check copy assignment op
left_vec = right_vec;
// overwrite right_vec with 0s
right_vec = vector_type<custom_float_t, size>{};
// check if values were copied correctly
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<custom_float_t>()(Number<i>{}).data, test_vec.at(i));
@@ -693,8 +717,11 @@ TEST(Custom_double, TestAsType)
ck::static_for<0, size, 1>{}([&](auto i) {
right_vec.template AsType<custom_double_t>()(Number<i>{}) = custom_double_t{test_vec.at(i)};
});
// copy the vector
vector_type<custom_double_t, size> left_vec{right_vec};
vector_type<custom_double_t, size> left_vec;
// check copy assignment op
left_vec = right_vec;
// overwrite right_vec with 0s
right_vec = vector_type<custom_double_t, size>{};
// check if values were copied correctly
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<custom_double_t>()(Number<i>{}).data, test_vec.at(i));
@@ -813,8 +840,11 @@ TEST(Complex_half, TestAsType)
right_vec.template AsType<complex_half_t>()(Number<i>{}) =
complex_half_t{test_vec.at(num_elem * i), test_vec.at(num_elem * i + 1)};
});
// copy the vector
vector_type<complex_half_t, size> left_vec{right_vec};
vector_type<complex_half_t, size> left_vec;
// check copy assignment op
left_vec = right_vec;
// overwrite right_vec with 0s
right_vec = vector_type<complex_half_t, size>{};
// check if values were copied correctly
ck::static_for<0, size, 1>{}([&](auto i) {
ASSERT_EQ(left_vec.template AsType<complex_half_t>()(Number<i>{}).real,
@@ -907,8 +937,11 @@ TEST(FP8OCP, TestAsType)
right_vec.template AsType<f8_t>()(Number<i>{}) = ck::type_convert<f8_t>(test_vec.at(i));
});
// copy the vector
vector_type<f8_t, size> left_vec{right_vec};
vector_type<f8_t, size> left_vec;
// check copy assignment op
left_vec = right_vec;
// overwrite right_vec with 0s
right_vec = vector_type<f8_t, size>{};
// check if values were copied correctly
ck::static_for<0, size, 1>{}([&](auto i) {
@@ -984,8 +1017,11 @@ TEST(BF8OCP, TestAsType)
right_vec.template AsType<bf8_t>()(Number<i>{}) = ck::type_convert<bf8_t>(test_vec.at(i));
});
// copy the vector
vector_type<bf8_t, size> left_vec{right_vec};
// check copy assignment op
left_vec = right_vec;
// overwrite right_vec with 0s
right_vec = vector_type<bf8_t, size>{};
// check if values were copied correctly
ck::static_for<0, size, 1>{}([&](auto i) {
...
add_gtest_executable(test_grouped_convnd_bwd_data test_grouped_convnd_bwd_data_xdl_wmma.cpp)
add_gtest_executable(test_grouped_convnd_bwd_data_xdl test_grouped_convnd_bwd_data_xdl.cpp)
if(result EQUAL 0)
target_link_libraries(test_grouped_convnd_bwd_data PRIVATE utility device_grouped_conv2d_bwd_data_instance device_grouped_conv3d_bwd_data_instance)
target_link_libraries(test_grouped_convnd_bwd_data_xdl PRIVATE utility device_grouped_conv2d_bwd_data_instance device_grouped_conv3d_bwd_data_instance)
endif()
add_gtest_executable(test_grouped_convnd_bwd_data_wmma test_grouped_convnd_bwd_data_wmma.cpp)
if(result EQUAL 0)
target_link_libraries(test_grouped_convnd_bwd_data_wmma PRIVATE utility device_grouped_conv2d_bwd_data_instance device_grouped_conv3d_bwd_data_instance)
endif()
add_gtest_executable(test_grouped_convnd_bwd_data_interface_xdl test_grouped_convnd_bwd_data_interface_xdl.cpp)
if(result EQUAL 0)
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <tuple>
#include <vector>
#include <gtest/gtest.h>
#include "profiler/profile_grouped_conv_bwd_data_impl.hpp"
template <typename Tuple>
class TestGroupedConvndBwdDataWmma : public ::testing::Test
{
protected:
using DataType = std::tuple_element_t<0, Tuple>;
using OutLayout = std::tuple_element_t<1, Tuple>;
using WeiLayout = std::tuple_element_t<2, Tuple>;
using InLayout = std::tuple_element_t<3, Tuple>;
std::vector<ck::utils::conv::ConvParam> conv_params;
template <ck::index_t NDimSpatial>
void Run()
{
EXPECT_FALSE(conv_params.empty());
bool pass = true;
for(auto& param : conv_params)
{
pass = pass && ck::profiler::profile_grouped_conv_bwd_data_impl<NDimSpatial,
OutLayout,
WeiLayout,
InLayout,
DataType,
DataType,
DataType>(
true, // do_verification
1, // init_method: integer value
false, // do_log
false, // time_kernel
param);
}
EXPECT_TRUE(pass);
}
};
using namespace ck::tensor_layout::convolution;
using KernelTypes2d = ::testing::Types<std::tuple<ck::half_t, GNHWK, GKYXC, GNHWC>,
std::tuple<int8_t, GNHWK, GKYXC, GNHWC>,
std::tuple<ck::half_t, NHWGK, GKYXC, NHWGC>,
std::tuple<int8_t, NHWGK, GKYXC, NHWGC>>;
using KernelTypes3d = ::testing::Types<std::tuple<ck::half_t, GNDHWK, GKZYXC, GNDHWC>,
std::tuple<int8_t, GNDHWK, GKZYXC, GNDHWC>,
std::tuple<ck::half_t, NDHWGK, GKZYXC, NDHWGC>,
std::tuple<int8_t, NDHWGK, GKZYXC, NDHWGC>>;
template <typename Tuple>
class TestGroupedConvndBwdDataWmma2d : public TestGroupedConvndBwdDataWmma<Tuple>
{
};
template <typename Tuple>
class TestGroupedConvndBwdDataWmma3d : public TestGroupedConvndBwdDataWmma<Tuple>
{
};
TYPED_TEST_SUITE(TestGroupedConvndBwdDataWmma2d, KernelTypes2d);
TYPED_TEST_SUITE(TestGroupedConvndBwdDataWmma3d, KernelTypes3d);
TYPED_TEST(TestGroupedConvndBwdDataWmma2d, Test2D)
{
this->conv_params.clear();
this->conv_params.push_back(
{2, 2, 4, 192, 192, {3, 3}, {28, 28}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
this->conv_params.push_back(
{2, 2, 128, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
this->conv_params.push_back(
{2, 2, 128, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}});
this->conv_params.push_back(
{2, 2, 128, 128, 256, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}});
this->conv_params.push_back({2, 1, 1, 1, 32, {8, 8}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
this->conv_params.push_back({2, 1, 1, 64, 3, {8, 8}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
this->conv_params.push_back({2, 1, 1, 1, 1, {8, 8}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
this->template Run<2>();
}
TYPED_TEST(TestGroupedConvndBwdDataWmma3d, Test3D)
{
this->conv_params.clear();
this->conv_params.push_back(
{3, 2, 16, 128, 256, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
this->conv_params.push_back(
{3, 2, 2, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
this->conv_params.push_back(
{3, 2, 32, 128, 256, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
this->conv_params.push_back(
{3, 1, 1, 1, 32, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
this->conv_params.push_back(
{3, 1, 1, 64, 3, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
this->conv_params.push_back(
{3, 1, 1, 1, 1, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
this->template Run<3>();
}
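With the XDL and WMMA backward-data tests now split into separate binaries by the CMake change above, the new WMMA suite can be run on its own. A sketch assuming the usual build/bin layout and standard GoogleTest flags (the filter pattern below is only an example):

```bash
# run the new WMMA grouped conv bwd-data tests (binary name from the CMake hunk above)
./build/bin/test_grouped_convnd_bwd_data_wmma
# or restrict to the 2D suite via a standard gtest filter
./build/bin/test_grouped_convnd_bwd_data_wmma --gtest_filter='*Wmma2d*'
```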