gaoqiong / composable_kernel_ROCM · Commits · 72c9f129

Commit 72c9f129, authored Sep 20, 2024 by Jun Liu
Merge branch 'amd-develop' into amd-master
Parents: 241c261f, ded0d83d

Changes: 235 changed files in this commit. This page shows 15 changed files, with 337 additions and 128 deletions (+337, -128).
Files shown on this page:

  script/process_perf_data.py                                           +15  -3
  script/process_qa_data.sh                                              +3  -2
  script/profile_grouped_conv_bwd_data.sh                                +0  -0
  script/profile_grouped_conv_bwd_weight.sh                             +22  -21
  script/run_full_performance_tests.sh                                  +20  -14
  script/profile_grouped_conv_fwd.sh                                    +39  -0
  script/run_performance_tests.sh                                       +15  -0
  test/data_type/CMakeLists.txt                                          +4  -0
  test/data_type/test_bf8.cpp                                           +23  -19
  test/data_type/test_fp8.cpp                                           +23  -19
  test/gemm_universal/test_gemm_universal_xdl.cpp                        +7  -2
  test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp     +10  -2
  test/grouped_convnd_fwd/CMakeLists.txt                                 +6  -0
  test/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp                   +23  -46
  test/grouped_convnd_fwd/test_grouped_convnd_fwd_large_cases_xdl.cpp  +127  -0
script/process_perf_data.py  (+15, -3)

@@ -122,7 +122,7 @@ def parse_logfile(logfile):
         #sorted_kernels = [x for _,x in sorted(zip(tests,kernels))]
         test_list = list(range(1, len(tests) + 1))
     #parse conv_fwd and conv_bwd performance tests:
-    elif 'conv_fwd' in logfile or 'conv_bwd_data' in logfile:
+    elif 'conv_fwd' in logfile or 'conv_bwd' in logfile:
         for line in open(logfile):
             if 'tflops:' in line:
                 lst = line.split()

@@ -274,14 +274,26 @@ def main():
         for i in range(1, len(results) + 1):
             testlist.append("Test%i" % i)
         table_name = "ck_grouped_gemm_tflops"
-    if 'conv_fwd' in filename:
+    if 'perf_conv_fwd' in filename:
         for i in range(1, len(results) + 1):
             testlist.append("Test%i" % i)
         table_name = "ck_conv_fwd_tflops"
-    if 'conv_bwd_data' in filename:
+    if 'perf_conv_bwd_data' in filename:
         for i in range(1, len(results) + 1):
             testlist.append("Test%i" % i)
         table_name = "ck_conv_bwd_data_tflops"
+    if 'grouped_conv_fwd' in filename:
+        for i in range(1, len(results) + 1):
+            testlist.append("Test%i" % i)
+        table_name = "ck_grouped_conv_fwd_tflops"
+    if 'grouped_conv_bwd_data' in filename:
+        for i in range(1, len(results) + 1):
+            testlist.append("Test%i" % i)
+        table_name = "ck_grouped_conv_bwd_data_tflops"
+    if 'grouped_conv_bwd_weight' in filename:
+        for i in range(1, len(results) + 1):
+            testlist.append("Test%i" % i)
+        table_name = "ck_grouped_conv_bwd_weight_tflops"
     if 'gemm_bilinear' in filename:
         for i in range(1, len(results) + 1):
             testlist.append("Test%i" % i)
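For orientation, here is a condensed sketch (not part of the commit, and pick_table is a hypothetical helper, not a function in the script) of the table-name dispatch above. The point of the new 'perf_' prefixes is that the bare substring 'conv_fwd' also matches the new perf_grouped_conv_fwd.log, so the plain-conv branch would fire on grouped logs too; since every branch in main() also appends to testlist, a double match would duplicate the test labels as well.

    # Hypothetical, condensed version of the dispatch in main().
    def pick_table(filename):
        table_name = None
        if 'perf_conv_fwd' in filename:        # was: 'conv_fwd', which would also match
            table_name = "ck_conv_fwd_tflops"  # perf_grouped_conv_fwd.log
        if 'grouped_conv_fwd' in filename:
            table_name = "ck_grouped_conv_fwd_tflops"
        return table_name

    print(pick_table("perf_conv_fwd.log"))          # ck_conv_fwd_tflops
    print(pick_table("perf_grouped_conv_fwd.log"))  # ck_grouped_conv_fwd_tflops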

script/process_qa_data.sh  (+3, -2)

@@ -15,8 +15,9 @@ python3 process_perf_data.py perf_resnet50_N256.log
 python3 process_perf_data.py perf_resnet50_N4.log
 python3 process_perf_data.py perf_batched_gemm.log
 python3 process_perf_data.py perf_grouped_gemm.log
-python3 process_perf_data.py perf_conv_fwd.log
-python3 process_perf_data.py perf_conv_bwd_data.log
+python3 process_perf_data.py perf_grouped_conv_fwd.log
+python3 process_perf_data.py perf_grouped_conv_bwd_data.log
+python3 process_perf_data.py perf_grouped_conv_bwd_weight.log
 python3 process_perf_data.py perf_gemm_bilinear.log
 python3 process_perf_data.py perf_reduction.log
 python3 process_perf_data.py perf_splitK_gemm.log

script/profile_conv_bwd_data.sh → script/profile_grouped_conv_bwd_data.sh  (+0, -0)

File moved; no content changes.

script/profile_conv_fwd.sh → script/profile_grouped_conv_bwd_weight.sh  (+22, -21)

@@ -12,27 +12,28 @@ INIT=$5
 LOG=$6
 TIME=$7
-N=$8
+N=$8
+SplitK=$9
 
 # Resnet50
 ######## op  datatype  layout  verify  init  log  time  conv_dim G__ N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads
-$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0
-$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 1024 1 1 14 14 1 1 1 1 0 0 0 0
-$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1
-$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0
-$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 128 3 3 56 56 2 2 1 1 1 1 1 1
-$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 2048 1 1 7 7 1 1 1 1 0 0 0 0
-$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0
-$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1
-$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 256 3 3 28 28 2 2 1 1 1 1 1 1
-$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 256 1 1 56 56 1 1 1 1 0 0 0 0
-$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 256 1 1 56 56 1 1 1 1 0 0 0 0
-$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 512 3 3 14 14 2 2 1 1 1 1 1 1
-$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0
-$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 512 1 1 28 28 1 1 1 1 0 0 0 0
-$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0
-$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 512 3 3 7 7 1 1 1 1 1 1 1 1
-$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0
-$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 64 1 1 56 56 1 1 1 1 0 0 0 0
-$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1
-$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 3 7 7 224 224 2 2 1 1 3 3 3 3
+$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0 $SplitK
+$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 1024 1 1 14 14 1 1 1 1 0 0 0 0 $SplitK
+$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1 $SplitK
+$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0 $SplitK
+$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 128 3 3 56 56 2 2 1 1 1 1 1 1 $SplitK
+$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 2048 1 1 7 7 1 1 1 1 0 0 0 0 $SplitK
+$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0 $SplitK
+$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1 $SplitK
+$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 256 3 3 28 28 2 2 1 1 1 1 1 1 $SplitK
+$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 256 1 1 56 56 1 1 1 1 0 0 0 0 $SplitK
+$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 256 1 1 56 56 1 1 1 1 0 0 0 0 $SplitK
+$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 512 3 3 14 14 2 2 1 1 1 1 1 1 $SplitK
+$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0 $SplitK
+$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 512 1 1 28 28 1 1 1 1 0 0 0 0 $SplitK
+$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0 $SplitK
+$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 512 3 3 7 7 1 1 1 1 1 1 1 1 $SplitK
+$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0 $SplitK
+$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 64 1 1 56 56 1 1 1 1 0 0 0 0 $SplitK
+$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1 $SplitK
+$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 3 7 7 224 224 2 2 1 1 3 3 3 3 $SplitK

script/profile_grouped_conv_fwd.sh  (new file, mode 100755)  (+39, -0)

#!/bin/bash
## GPU visibility
export HIP_VISIBLE_DEVICES=0

DRIVER="../build/bin/ckProfiler"
OP=$1
DATATYPE=$2
LAYOUT=$3
INDEXTYPE=$4
VERIFY=$5
INIT=$6
LOG=$7
TIME=$8
N=$9

# Resnet50
######## op  datatype  indextype  layout  verify  init  log  time  conv_dim G__ N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 1024 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 128 3 3 56 56 2 2 1 1 1 1 1 1
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 2048 1 1 7 7 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 256 3 3 28 28 2 2 1 1 1 1 1 1
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 256 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 256 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 512 3 3 14 14 2 2 1 1 1 1 1 1
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 512 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 512 3 3 7 7 1 1 1 1 1 1 1 1
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 64 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 3 7 7 224 224 2 2 1 1 3 3 3 3
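As a minimal sketch (not part of the commit) of driving this new script from Python, the call below mirrors the fp16 invocation added to run_performance_tests.sh further down this page; the positional meanings are inferred from the variable assignments at the top of the script, and the "1 = fp16" reading comes from the "#run grouped_fwd fp16 tests" block there.

    import subprocess

    # Positional order follows OP=$1 ... N=$9 in profile_grouped_conv_fwd.sh.
    args = {
        "OP": "grouped_conv_fwd",
        "DATATYPE": "1",   # 1 = fp16, per the fp16 run in run_performance_tests.sh
        "LAYOUT": "1",
        "INDEXTYPE": "0",
        "VERIFY": "1",     # corresponds to $verify in the run scripts
        "INIT": "1",
        "LOG": "0",
        "TIME": "1",
        "N": "256",        # batch size
    }
    subprocess.run(["./profile_grouped_conv_fwd.sh", *args.values()], check=True)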

script/run_full_performance_tests.sh  (+20, -14)

@@ -90,21 +90,27 @@ print_log_header $gemm_bilinear_log $env_type $branch $host_name
 ./profile_gemm_bilinear.sh gemm_bilinear 1 2 $verify 1 0 1 2>&1 | tee -a $gemm_bilinear_log
 ./profile_gemm_bilinear.sh gemm_bilinear 1 3 $verify 1 0 1 2>&1 | tee -a $gemm_bilinear_log
 
-#run conv_fwd tests
-export conv_fwd_log="perf_conv_fwd.log"
-print_log_header $conv_fwd_log $env_type $branch $host_name
-./profile_conv_fwd.sh conv_fwd 0 1 $verify 1 0 1 256 2>&1 | tee -a $conv_fwd_log
-./profile_conv_fwd.sh conv_fwd 1 1 $verify 1 0 1 256 2>&1 | tee -a $conv_fwd_log
-./profile_conv_fwd.sh conv_fwd 2 1 $verify 1 0 1 256 2>&1 | tee -a $conv_fwd_log
-./profile_conv_fwd.sh conv_fwd 3 1 $verify 1 0 1 256 2>&1 | tee -a $conv_fwd_log
+#run grouped_fwd tests
+export grouped_conv_fwd_log="perf_grouped_conv_fwd.log"
+print_log_header $grouped_conv_fwd_log $env_type $branch $host_name
+./profile_grouped_conv_fwd.sh grouped_conv_fwd 0 1 0 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_fwd_log
+./profile_grouped_conv_fwd.sh grouped_conv_fwd 1 1 0 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_fwd_log
+./profile_grouped_conv_fwd.sh grouped_conv_fwd 2 1 0 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_fwd_log
 
-#run conv_bwd_data tests
-export conv_bwd_data_log="perf_conv_bwd_data.log"
-print_log_header $conv_bwd_data_log $env_type $branch $host_name
-./profile_conv_bwd_data.sh conv_bwd_data 0 1 $verify 1 0 1 256 2>&1 | tee -a $conv_bwd_data_log
-./profile_conv_bwd_data.sh conv_bwd_data 1 1 $verify 1 0 1 256 2>&1 | tee -a $conv_bwd_data_log
-./profile_conv_bwd_data.sh conv_bwd_data 2 1 $verify 1 0 1 256 2>&1 | tee -a $conv_bwd_data_log
-./profile_conv_bwd_data.sh conv_bwd_data 3 1 $verify 1 0 1 256 2>&1 | tee -a $conv_bwd_data_log
+#run grouped_bwd_data tests
+export grouped_conv_bwd_data_log="perf_grouped_conv_bwd_data.log"
+print_log_header $grouped_conv_bwd_data_log $env_type $branch $host_name
+./profile_grouped_conv_bwd_data.sh grouped_conv_bwd_data 0 1 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_bwd_data_log
+./profile_grouped_conv_bwd_data.sh grouped_conv_bwd_data 1 1 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_bwd_data_log
+./profile_grouped_conv_bwd_data.sh grouped_conv_bwd_data 2 1 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_bwd_data_log
+
+#run grouped_bwd_weight tests
+export grouped_conv_bwd_weight_log="perf_grouped_conv_bwd_weight.log"
+print_log_header $grouped_conv_bwd_weight_log $env_type $branch $host_name
+./profile_grouped_conv_bwd_weight.sh grouped_conv_bwd_weight 0 2 $verify 1 0 1 256 1 2>&1 | tee -a $grouped_conv_bwd_weight_log
+./profile_grouped_conv_bwd_weight.sh grouped_conv_bwd_weight 1 2 $verify 1 0 1 256 1 2>&1 | tee -a $grouped_conv_bwd_weight_log
+./profile_grouped_conv_bwd_weight.sh grouped_conv_bwd_weight 2 2 $verify 1 0 1 256 1 2>&1 | tee -a $grouped_conv_bwd_weight_log
+./profile_grouped_conv_bwd_weight.sh grouped_conv_bwd_weight 1 2 $verify 1 0 1 256 4 2>&1 | tee -a $grouped_conv_bwd_weight_log
 
 #run resnet50 tests
 export resnet256_log="perf_resnet50_N256.log"

script/run_performance_tests.sh  (+15, -0)

@@ -51,6 +51,21 @@ print_log_header $gemm_log $env_type $branch $host_name
 ./profile_gemm.sh gemm 2 3 $verify 1 0 1 | tee -a $gemm_log
 ./profile_gemm.sh gemm 3 3 $verify 1 0 1 | tee -a $gemm_log
 
+#run grouped_fwd fp16 tests
+export grouped_conv_fwd_log="perf_grouped_conv_fwd_fp16.log"
+print_log_header $conv_fwd_log $env_type $branch $host_name
+./profile_grouped_conv_fwd.sh grouped_conv_fwd 1 1 0 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_fwd_log
+
+#run grouped_bwd_data fp16 tests
+export grouped_conv_bwd_data_log="perf_grouped_conv_bwd_data_fp16.log"
+print_log_header $grouped_conv_bwd_data_log $env_type $branch $host_name
+./profile_grouped_conv_bwd_data.sh grouped_conv_bwd_data 1 1 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_bwd_data_log
+
+#run grouped_bwd_weight fp16 tests
+export grouped_conv_bwd_weight_log="perf_grouped_conv_bwd_weight_fp16.log"
+print_log_header $grouped_conv_bwd_weight_log $env_type $branch $host_name
+./profile_grouped_conv_bwd_weight.sh grouped_conv_bwd_weight 1 1 $verify 1 0 1 256 1 2>&1 | tee -a $grouped_conv_bwd_weight_log
+
 #run resnet50 tests
 export resnet256_log="perf_resnet50_N256.log"
 print_log_header $resnet256_log $env_type $branch $host_name

test/data_type/CMakeLists.txt  (+4, -0)

+# temporarily disable flaky test for all architectures
+add_definitions(-DCK_SKIP_FLAKY_F8_TEST)
+set(CK_SKIP_FLAKY_F8_TEST "ON")
+
 if(USE_BITINT_EXTENSION_INT4)
     add_gtest_executable(test_int4 test_int4.cpp)
     if(result EQUAL 0)

test/data_type/test_bf8.cpp  (+23, -19)

 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
 #include "ck/utility/data_type.hpp"
 #include "ck/utility/type_convert.hpp"
 
 using ck::bf8_t;
+using ck::f8_convert_rne;
 using ck::f8_convert_sr;
 using ck::half_t;
 using ck::type_convert;

@@ -24,33 +25,36 @@ TEST(BF8, ConvertFP32Nearest)
     // fix the tolerance value
     float abs_tol = 1e-6;
     // convert 0 float to bf8 and back, check if holds
-    ASSERT_NEAR(0.0f, type_convert<float>(type_convert<bf8_t>(0.0f)), abs_tol);
+    ASSERT_NEAR(0.0f, type_convert<float>(f8_convert_rne<bf8_t>(0.0f)), abs_tol);
+    // don't run the next test on gfx11 devices
+#ifndef CK_SKIP_FLAKY_F8_TEST
     // convert minimal float to bf8 and back, check if holds
     ASSERT_NEAR(std::numeric_limits<float>::min(),
-                type_convert<float>(type_convert<bf8_t>(std::numeric_limits<float>::min())),
+                type_convert<float>(f8_convert_rne<bf8_t>(std::numeric_limits<float>::min())),
                 abs_tol);
+#endif
     // convert maximal bf8_t to float and check if equal to 57344.0
-    ASSERT_NEAR(57344.0f, type_convert<float>(type_convert<bf8_t>(57344.0f)), abs_tol);
+    ASSERT_NEAR(57344.0f, type_convert<float>(f8_convert_rne<bf8_t>(57344.0f)), abs_tol);
     // convert maximal float to bf8 and back, check if clipped to 57344.0
     ASSERT_NEAR(57344.0f,
-                type_convert<float>(type_convert<bf8_t>(std::numeric_limits<float>::max())),
+                type_convert<float>(f8_convert_rne<bf8_t>(std::numeric_limits<float>::max())),
                 abs_tol);
     // convert inf float to bf8_t and check if it is qNan
     ASSERT_NEAR(type_convert<bf8_t>(0x80),
-                type_convert<bf8_t>(std::numeric_limits<float>::infinity()),
+                f8_convert_rne<bf8_t>(std::numeric_limits<float>::infinity()),
                 abs_tol);
     // positive norm float value to bf8 and back, check if holds
     float pos_float = 0.0000762939f;
-    ASSERT_NEAR(pos_float, type_convert<float>(type_convert<bf8_t>(pos_float)), abs_tol);
+    ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_rne<bf8_t>(pos_float)), abs_tol);
     // negative norm float value to bf8 and back, check if holds
     float neg_float = -0.0000610351f;
-    ASSERT_NEAR(neg_float, type_convert<float>(type_convert<bf8_t>(neg_float)), abs_tol);
+    ASSERT_NEAR(neg_float, type_convert<float>(f8_convert_rne<bf8_t>(neg_float)), abs_tol);
     // positive subnorm float value to bf8 and back, check if holds
     pos_float = 0.0000305175f;
-    ASSERT_NEAR(pos_float, type_convert<float>(type_convert<bf8_t>(pos_float)), abs_tol);
+    ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_rne<bf8_t>(pos_float)), abs_tol);
     // negative subnorm float value to bf8 and back, check if holds
     neg_float = -0.0000152587f;
-    ASSERT_NEAR(neg_float, type_convert<float>(type_convert<bf8_t>(neg_float)), abs_tol);
+    ASSERT_NEAR(neg_float, type_convert<float>(f8_convert_rne<bf8_t>(neg_float)), abs_tol);
 }
 
 TEST(BF8, ConvertFP32Stochastic)

@@ -92,34 +96,34 @@ TEST(BF8, ConvertFP16Nearest)
     // fix the tolerance value
     float abs_tol = 1e-3;
     // convert 0 fp16 to bf8 and back, check if holds
-    ASSERT_NEAR(half_t{0.0}, type_convert<half_t>(type_convert<bf8_t>(half_t{0.0})), abs_tol);
+    ASSERT_NEAR(half_t{0.0}, type_convert<half_t>(f8_convert_rne<bf8_t>(half_t{0.0})), abs_tol);
     // convert minimal fp16 to bf8 and back, check if holds
     ASSERT_NEAR(ck::NumericLimits<half_t>::Min(),
-                type_convert<half_t>(type_convert<bf8_t>(ck::NumericLimits<half_t>::Min())),
+                type_convert<half_t>(f8_convert_rne<bf8_t>(ck::NumericLimits<half_t>::Min())),
                 abs_tol);
     // convert maximal bf8_t to fp16 and check if equal to 57344.0
     ASSERT_NEAR(
-        half_t{57344.0}, type_convert<half_t>(type_convert<bf8_t>(half_t{57344.0})), abs_tol);
+        half_t{57344.0}, type_convert<half_t>(f8_convert_rne<bf8_t>(half_t{57344.0})), abs_tol);
     // convert maximal fp16 to bf8 and back, check if clipped to 57344.0
     ASSERT_NEAR(half_t{57344.0},
-                type_convert<half_t>(type_convert<bf8_t>(ck::NumericLimits<half_t>::Max())),
+                type_convert<half_t>(f8_convert_rne<bf8_t>(ck::NumericLimits<half_t>::Max())),
                 abs_tol);
     // convert QuietNaN fp16 to bf8_t and check if it is QuietNaN
     ASSERT_NEAR(type_convert<bf8_t>(0x80),
-                type_convert<bf8_t>(ck::NumericLimits<half_t>::QuietNaN()),
+                f8_convert_rne<bf8_t>(ck::NumericLimits<half_t>::QuietNaN()),
                 abs_tol);
     // positive norm fp16 value to bf8 and back, check if holds
     half_t pos_half = half_t{0.0000762939};
-    ASSERT_NEAR(pos_half, type_convert<half_t>(type_convert<bf8_t>(pos_half)), abs_tol);
+    ASSERT_NEAR(pos_half, type_convert<half_t>(f8_convert_rne<bf8_t>(pos_half)), abs_tol);
     // negative norm fp16 value to bf8 and back, check if holds
     half_t neg_half = half_t{-0.0000610351};
-    ASSERT_NEAR(neg_half, type_convert<half_t>(type_convert<bf8_t>(neg_half)), abs_tol);
+    ASSERT_NEAR(neg_half, type_convert<half_t>(f8_convert_rne<bf8_t>(neg_half)), abs_tol);
     // positive subnorm fp16 value to bf8 and back, check if holds
     pos_half = half_t{0.0000305175};
-    ASSERT_NEAR(pos_half, type_convert<half_t>(type_convert<bf8_t>(pos_half)), abs_tol);
+    ASSERT_NEAR(pos_half, type_convert<half_t>(f8_convert_rne<bf8_t>(pos_half)), abs_tol);
     // negative subnorm fp16 value to bf8 and back, check if holds
     neg_half = half_t{-0.0000152587};
-    ASSERT_NEAR(neg_half, type_convert<half_t>(type_convert<bf8_t>(neg_half)), abs_tol);
+    ASSERT_NEAR(neg_half, type_convert<half_t>(f8_convert_rne<bf8_t>(neg_half)), abs_tol);
 }
 
 TEST(BF8, ConvertFP16Stochastic)

test/data_type/test_fp8.cpp  (+23, -19)

 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
 #include "ck/utility/data_type.hpp"
 #include "ck/utility/type_convert.hpp"
 
+using ck::f8_convert_rne;
 using ck::f8_convert_sr;
 using ck::f8_t;
 using ck::half_t;

@@ -24,33 +25,36 @@ TEST(FP8, ConvertFP32Nearest)
     // fix the tolerance value
     float abs_tol = 1e-6;
     // convert 0 float to fp8 and back, check if holds
-    ASSERT_NEAR(0.0f, type_convert<float>(type_convert<f8_t>(0.0f)), abs_tol);
+    ASSERT_NEAR(0.0f, type_convert<float>(f8_convert_rne<f8_t>(0.0f)), abs_tol);
+    // don't run the next test on gfx11 devices
+#ifndef CK_SKIP_FLAKY_F8_TEST
     // convert minimal float to fp8 and back, check if holds
     ASSERT_NEAR(std::numeric_limits<float>::min(),
-                type_convert<float>(type_convert<f8_t>(std::numeric_limits<float>::min())),
+                type_convert<float>(f8_convert_rne<f8_t>(std::numeric_limits<float>::min())),
                 abs_tol);
+#endif
     // convert maximal f8_t to float and check if equal to 240.0
-    ASSERT_NEAR(240.0f, type_convert<float>(type_convert<f8_t>(240.0f)), abs_tol);
+    ASSERT_NEAR(240.0f, type_convert<float>(f8_convert_rne<f8_t>(240.0f)), abs_tol);
     // convert maximal float to fp8 and back, check if clipped to 240.0
     ASSERT_NEAR(240.0f,
-                type_convert<float>(type_convert<f8_t>(std::numeric_limits<float>::max())),
+                type_convert<float>(f8_convert_rne<f8_t>(std::numeric_limits<float>::max())),
                 abs_tol);
     // convert inf float to f8_t and check if it is qNan
     ASSERT_NEAR(type_convert<f8_t>(0x80),
-                type_convert<f8_t>(std::numeric_limits<float>::infinity()),
+                f8_convert_rne<f8_t>(std::numeric_limits<float>::infinity()),
                 abs_tol);
     // positive norm float value to fp8 and back, check if holds
     float pos_float = 0.017578125f;
-    ASSERT_NEAR(pos_float, type_convert<float>(type_convert<f8_t>(pos_float)), abs_tol);
+    ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_rne<f8_t>(pos_float)), abs_tol);
     // negative norm float value to fp8 and back, check if holds
     float neg_float = -0.015625f;
-    ASSERT_NEAR(neg_float, type_convert<float>(type_convert<f8_t>(neg_float)), abs_tol);
+    ASSERT_NEAR(neg_float, type_convert<float>(f8_convert_rne<f8_t>(neg_float)), abs_tol);
     // positive subnorm float value to fp8 and back, check if holds
     pos_float = 0.00390625f;
-    ASSERT_NEAR(pos_float, type_convert<float>(type_convert<f8_t>(pos_float)), abs_tol);
+    ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_rne<f8_t>(pos_float)), abs_tol);
     // negative subnorm float value to fp8 and back, check if holds
    neg_float = -0.001953125f;
-    ASSERT_NEAR(neg_float, type_convert<float>(type_convert<f8_t>(neg_float)), abs_tol);
+    ASSERT_NEAR(neg_float, type_convert<float>(f8_convert_rne<f8_t>(neg_float)), abs_tol);
 }
 
 TEST(FP8, ConvertFP32Stochastic)

@@ -92,33 +96,33 @@ TEST(FP8, ConvertFP16Nearest)
     // fix the tolerance value
     float abs_tol = 1e-3;
     // convert 0 fp16 to fp8 and back, check if holds
-    ASSERT_NEAR(half_t{0.0}, type_convert<half_t>(type_convert<f8_t>(half_t{0.0})), abs_tol);
+    ASSERT_NEAR(half_t{0.0}, type_convert<half_t>(f8_convert_rne<f8_t>(half_t{0.0})), abs_tol);
     // convert minimal fp16 to fp8 and back, check if holds
     ASSERT_NEAR(ck::NumericLimits<half_t>::Min(),
-                type_convert<half_t>(type_convert<f8_t>(ck::NumericLimits<half_t>::Min())),
+                type_convert<half_t>(f8_convert_rne<f8_t>(ck::NumericLimits<half_t>::Min())),
                 abs_tol);
     // convert maximal f8_t to fp16 and check if equal to 240.0
     ASSERT_NEAR(
-        half_t{240.0}, type_convert<half_t>(type_convert<f8_t>(half_t{240.0})), abs_tol);
+        half_t{240.0}, type_convert<half_t>(f8_convert_rne<f8_t>(half_t{240.0})), abs_tol);
     // convert maximal fp16 to fp8 and back, check if clipped to 240.0
     ASSERT_NEAR(half_t{240.0},
-                type_convert<half_t>(type_convert<f8_t>(ck::NumericLimits<half_t>::Max())),
+                type_convert<half_t>(f8_convert_rne<f8_t>(ck::NumericLimits<half_t>::Max())),
                 abs_tol);
     // convert QuietNaN fp16 to f8_t and check if it is QuietNaN
     ASSERT_NEAR(type_convert<f8_t>(0x80),
-                type_convert<f8_t>(ck::NumericLimits<half_t>::QuietNaN()),
+                f8_convert_rne<f8_t>(ck::NumericLimits<half_t>::QuietNaN()),
                 abs_tol);
     // positive norm fp16 value to fp8 and back, check if holds
     half_t pos_half = half_t{0.017578125};
-    ASSERT_NEAR(pos_half, type_convert<half_t>(type_convert<f8_t>(pos_half)), abs_tol);
+    ASSERT_NEAR(pos_half, type_convert<half_t>(f8_convert_rne<f8_t>(pos_half)), abs_tol);
     // negative norm fp16 value to fp8 and back, check if holds
     half_t neg_half = half_t{-0.015625};
-    ASSERT_NEAR(neg_half, type_convert<half_t>(type_convert<f8_t>(neg_half)), abs_tol);
+    ASSERT_NEAR(neg_half, type_convert<half_t>(f8_convert_rne<f8_t>(neg_half)), abs_tol);
     // positive subnorm fp16 value to fp8 and back, check if holds
     pos_half = half_t{0.00390625};
-    ASSERT_NEAR(pos_half, type_convert<half_t>(type_convert<f8_t>(pos_half)), abs_tol);
+    ASSERT_NEAR(pos_half, type_convert<half_t>(f8_convert_rne<f8_t>(pos_half)), abs_tol);
     // negative subnorm fp16 value to fp8 and back, check if holds
     neg_half = half_t{-0.001953125};
-    ASSERT_NEAR(neg_half, type_convert<half_t>(type_convert<f8_t>(neg_half)), abs_tol);
+    ASSERT_NEAR(neg_half, type_convert<half_t>(f8_convert_rne<f8_t>(neg_half)), abs_tol);
 }
 
 TEST(FP8, ConvertFP16Stochastic)
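A quick aside (not part of the commit) on the two clipping constants these tests pin down, 240.0 for f8_t and 57344.0 for bf8_t. Assuming the FNUZ-style e4m3/e5m2 encodings that the 240.0 maximum suggests, the values follow directly from the bit layouts:

    # f8_t (e4m3, FNUZ-style with exponent bias 8): 3 mantissa bits, top exponent 2**7.
    f8_max  = (1 + 7 / 8) * 2 ** 7    # 1.875 * 128
    # bf8_t (e5m2): 2 mantissa bits, top usable exponent 2**15.
    bf8_max = (1 + 3 / 4) * 2 ** 15   # 1.75 * 32768
    print(f8_max, bf8_max)            # 240.0 57344.0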

test/gemm_universal/test_gemm_universal_xdl.cpp  (+7, -2)

@@ -44,17 +44,22 @@ class TestGemmUniversal_MK_NK
 using KernelTypes_MK_KN = ::testing::Types<
     // ADataType, BDataType, ComputeDataType, CDataType
     std::tuple<F16, F16, F16, F16>,
+#if (defined CK_ENABLE_FP8)
     std::tuple<F16, F8, F16, F16>,
     std::tuple<F8, F16, F16, F16>,
+    std::tuple<F8, F8, F8, BF16>,
+#endif
     std::tuple<BF16, BF16, BF16, BF16>>;
 
 using KernelTypes_MK_NK = ::testing::Types<
     // ADataType, BDataType, ComputeDataType, CDataType
     std::tuple<F16, F16, F16, F16>,
+#if (defined CK_ENABLE_FP8)
     std::tuple<F16, F8, F16, F16>,
     std::tuple<F8, F16, F16, F16>,
-    std::tuple<BF16, BF16, BF16, BF16>,
-    std::tuple<F8, F8, F8, BF16>>;
+    std::tuple<F8, F8, F8, BF16>,
+#endif
+    std::tuple<BF16, BF16, BF16, BF16>>;
 // clang-format on

test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp  (+10, -2)

@@ -66,6 +66,12 @@ class TestGroupedConvndBwdWeight : public ::testing::Test
         {
             return true;
         }
+        // Skip due to the lack of kernels for NGCDHW
+        if constexpr(std::is_same_v<InLayout, NGCW> || std::is_same_v<InLayout, NGCHW> ||
+                     std::is_same_v<InLayout, NGCDHW>)
+        {
+            return true;
+        }
     }
     else
     {

@@ -139,7 +145,8 @@ using KernelTypes2d = ::testing::Types<
     std::tuple<ck::bhalf_t, float, ck::bhalf_t, GNHWC, GKYXC, GNHWK, ck::Number<2>>,
     std::tuple<float, float, float, NHWGC, GKYXC, NHWGK, ck::Number<2>>,
     std::tuple<ck::half_t, ck::half_t, ck::half_t, NHWGC, GKYXC, NHWGK, ck::Number<2>>,
-    std::tuple<ck::bhalf_t, float, ck::bhalf_t, NHWGC, GKYXC, NHWGK, ck::Number<2>>>;
+    std::tuple<ck::bhalf_t, float, ck::bhalf_t, NHWGC, GKYXC, NHWGK, ck::Number<2>>,
+    std::tuple<ck::half_t, ck::half_t, ck::half_t, NGCHW, GKYXC, NGKHW, ck::Number<2>>>;
 
 using KernelTypes3d = ::testing::Types<
     std::tuple<float, float, float, GNDHWC, GKZYXC, GNDHWK, ck::Number<3>>,
     std::tuple<ck::half_t, ck::half_t, ck::half_t, GNDHWC, GKZYXC, GNDHWK, ck::Number<3>>,

@@ -148,7 +155,8 @@ using KernelTypes3d = ::testing::Types<
     std::tuple<float, float, float, NDHWGC, GKZYXC, NDHWGK, ck::Number<3>>,
     std::tuple<ck::half_t, ck::half_t, ck::half_t, NDHWGC, GKZYXC, NDHWGK, ck::Number<3>>,
     std::tuple<ck::bhalf_t, float, ck::bhalf_t, NDHWGC, GKZYXC, NDHWGK, ck::Number<3>>,
-    std::tuple<int8_t, int8_t, int8_t, NDHWGC, GKZYXC, NDHWGK, ck::Number<3>>>;
+    std::tuple<int8_t, int8_t, int8_t, NDHWGC, GKZYXC, NDHWGK, ck::Number<3>>,
+    std::tuple<ck::half_t, ck::half_t, ck::half_t, NGCDHW, GKZYXC, NGKDHW, ck::Number<3>>>;
 
 TYPED_TEST_SUITE(TestGroupedConvndBwdWeight1d, KernelTypes1d);
 TYPED_TEST_SUITE(TestGroupedConvndBwdWeight2d, KernelTypes2d);

test/grouped_convnd_fwd/CMakeLists.txt  (+6, -0)

@@ -7,6 +7,12 @@ if(GPU_TARGETS MATCHES "gfx9" OR GPU_TARGETS MATCHES "gfx11")
     endif()
 endif()
+if(GPU_TARGETS MATCHES "gfx9")
+    add_executable(test_grouped_convnd_fwd_large_cases_xdl test_grouped_convnd_fwd_large_cases_xdl.cpp)
+    target_compile_options(test_grouped_convnd_fwd_large_cases_xdl PRIVATE -Wno-global-constructors -Wno-undef)
+    target_link_libraries(test_grouped_convnd_fwd_large_cases_xdl
+        PRIVATE gtest_main getopt::getopt utility device_grouped_conv1d_fwd_instance device_grouped_conv2d_fwd_instance device_grouped_conv3d_fwd_instance)
+endif()
 add_gtest_executable(test_grouped_convnd_fwd_multi_ab_interface test_grouped_convnd_fwd_multi_ab_interface.cpp)
 if(result EQUAL 0)
     target_link_libraries(test_grouped_convnd_fwd_multi_ab_interface PRIVATE utility)

test/grouped_convnd_fwd/test_grouped_convnd_fwd.cpp  (+23, -46)

@@ -17,7 +17,7 @@ class TestGroupedConvndFwd : public ::testing::Test
     using InLayout  = std::tuple_element_t<1, Tuple>;
     using WeiLayout = std::tuple_element_t<2, Tuple>;
     using OutLayout = std::tuple_element_t<3, Tuple>;
-    using IndexType = std::tuple_element_t<4, Tuple>;
+    using IndexType = ck::index_t;
 
     std::vector<ck::utils::conv::ConvParam> conv_params;

@@ -50,31 +50,28 @@ class TestGroupedConvndFwd : public ::testing::Test
 using namespace ck::tensor_layout::convolution;
 
-using KernelTypes1d = ::testing::Types<std::tuple<float, GNWC, GKXC, GNWK, ck::index_t>,
-                                       std::tuple<ck::half_t, GNWC, GKXC, GNWK, ck::index_t>,
-                                       std::tuple<ck::bhalf_t, GNWC, GKXC, GNWK, ck::index_t>,
-                                       std::tuple<int8_t, GNWC, GKXC, GNWK, ck::index_t>>;
-using KernelTypes2d = ::testing::Types<std::tuple<float, GNHWC, GKYXC, GNHWK, ck::index_t>,
-                                       std::tuple<ck::half_t, GNHWC, GKYXC, GNHWK, ck::index_t>,
-                                       std::tuple<ck::bhalf_t, GNHWC, GKYXC, GNHWK, ck::index_t>,
-                                       std::tuple<int8_t, GNHWC, GKYXC, GNHWK, ck::index_t>,
-                                       std::tuple<float, NHWGC, GKYXC, NHWGK, ck::index_t>,
-                                       std::tuple<ck::half_t, NHWGC, GKYXC, NHWGK, ck::index_t>,
-                                       std::tuple<ck::bhalf_t, NHWGC, GKYXC, NHWGK, ck::index_t>,
-                                       std::tuple<int8_t, NHWGC, GKYXC, NHWGK, ck::index_t>>;
-using KernelTypes3d = ::testing::Types<std::tuple<float, GNDHWC, GKZYXC, GNDHWK, ck::index_t>,
-                                       std::tuple<ck::half_t, GNDHWC, GKZYXC, GNDHWK, ck::index_t>,
-                                       std::tuple<ck::bhalf_t, GNDHWC, GKZYXC, GNDHWK, ck::index_t>,
-                                       std::tuple<int8_t, GNDHWC, GKZYXC, GNDHWK, ck::index_t>,
-                                       std::tuple<float, NDHWGC, GKZYXC, NDHWGK, ck::index_t>,
-                                       std::tuple<ck::half_t, NDHWGC, GKZYXC, NDHWGK, ck::index_t>,
-                                       std::tuple<ck::bhalf_t, NDHWGC, GKZYXC, NDHWGK, ck::index_t>,
-                                       std::tuple<int8_t, NDHWGC, GKZYXC, NDHWGK, ck::index_t>>;
-using KernelTypes2dLargeCases =
-    ::testing::Types<std::tuple<float, NHWGC, GKYXC, NHWGK, ck::long_index_t>>;
+using KernelTypes1d = ::testing::Types<std::tuple<float, GNWC, GKXC, GNWK>,
+                                       std::tuple<ck::half_t, GNWC, GKXC, GNWK>,
+                                       std::tuple<ck::bhalf_t, GNWC, GKXC, GNWK>,
+                                       std::tuple<int8_t, GNWC, GKXC, GNWK>>;
+using KernelTypes2d = ::testing::Types<std::tuple<float, GNHWC, GKYXC, GNHWK>,
+                                       std::tuple<ck::half_t, GNHWC, GKYXC, GNHWK>,
+                                       std::tuple<ck::bhalf_t, GNHWC, GKYXC, GNHWK>,
+                                       std::tuple<int8_t, GNHWC, GKYXC, GNHWK>,
+                                       std::tuple<float, NHWGC, GKYXC, NHWGK>,
+                                       std::tuple<ck::half_t, NHWGC, GKYXC, NHWGK>,
+                                       std::tuple<ck::bhalf_t, NHWGC, GKYXC, NHWGK>,
+                                       std::tuple<int8_t, NHWGC, GKYXC, NHWGK>>;
+using KernelTypes3d = ::testing::Types<std::tuple<float, GNDHWC, GKZYXC, GNDHWK>,
+                                       std::tuple<ck::half_t, GNDHWC, GKZYXC, GNDHWK>,
+                                       std::tuple<ck::bhalf_t, GNDHWC, GKZYXC, GNDHWK>,
+                                       std::tuple<int8_t, GNDHWC, GKZYXC, GNDHWK>,
+                                       std::tuple<float, NDHWGC, GKZYXC, NDHWGK>,
+                                       std::tuple<ck::half_t, NDHWGC, GKZYXC, NDHWGK>,
+                                       std::tuple<ck::bhalf_t, NDHWGC, GKZYXC, NDHWGK>,
+                                       std::tuple<int8_t, NDHWGC, GKZYXC, NDHWGK>>;
 
 template <typename Tuple>
 class TestGroupedConvndFwd1d : public TestGroupedConvndFwd<Tuple>

@@ -91,15 +88,9 @@ class TestGroupedConvndFwd3d : public TestGroupedConvndFwd<Tuple>
 {
 };
 
-template <typename Tuple>
-class TestGroupedConvndFwd2dLargeCases : public TestGroupedConvndFwd<Tuple>
-{
-};
-
 TYPED_TEST_SUITE(TestGroupedConvndFwd1d, KernelTypes1d);
 TYPED_TEST_SUITE(TestGroupedConvndFwd2d, KernelTypes2d);
 TYPED_TEST_SUITE(TestGroupedConvndFwd3d, KernelTypes3d);
-TYPED_TEST_SUITE(TestGroupedConvndFwd2dLargeCases, KernelTypes2dLargeCases);
 
 TYPED_TEST(TestGroupedConvndFwd1d, Test1D)
 {

@@ -149,17 +140,3 @@ TYPED_TEST(TestGroupedConvndFwd3d, Test3D)
         {3, 96, 1, 1, 1, {3, 3, 3}, {4, 30, 160}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
     this->template Run<3>();
 }
-
-TYPED_TEST(TestGroupedConvndFwd2dLargeCases, Test2DLargeCases)
-{
-    // Case larger than 2GB
-    this->conv_params.push_back(
-        {2, 1, 64, 4, 192, {2, 2}, {224, 224}, {224, 224}, {1, 1}, {0, 0}, {0, 0}});
-    // With supported NumGroupsToMerge > 1
-    this->conv_params.push_back(
-        {2, 32, 64, 1, 1, {2, 2}, {672, 672}, {672, 672}, {1, 1}, {0, 0}, {0, 0}});
-    // When image is larger than 2GB
-    this->conv_params.push_back(
-        {2, 1, 1, 256, 256, {3, 3}, {4096, 2048}, {1024, 1024}, {3, 3}, {1, 1}, {1, 1}});
-    this->template Run<2>();
-}

test/grouped_convnd_fwd/test_grouped_convnd_fwd_large_cases_xdl.cpp  (new file, mode 100644)  (+127, -0)

// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <vector>

#include <gtest/gtest.h>

#include "profiler/profile_grouped_conv_fwd_impl.hpp"

template <typename Tuple>
class TestGroupedConvndFwd : public ::testing::Test
{
    protected:
    using DataType  = std::tuple_element_t<0, Tuple>;
    using InLayout  = std::tuple_element_t<1, Tuple>;
    using WeiLayout = std::tuple_element_t<2, Tuple>;
    using OutLayout = std::tuple_element_t<3, Tuple>;
    using IndexType = ck::long_index_t;

    std::vector<ck::utils::conv::ConvParam> conv_params;

    template <ck::index_t NDimSpatial>
    void Run()
    {
        EXPECT_FALSE(conv_params.empty());
        bool pass = true;
        for(auto& param : conv_params)
        {
            pass = pass && ck::profiler::profile_grouped_conv_fwd_impl<NDimSpatial,
                                                                       InLayout,
                                                                       WeiLayout,
                                                                       OutLayout,
                                                                       DataType,
                                                                       DataType,
                                                                       DataType,
                                                                       DataType,
                                                                       DataType,
                                                                       IndexType>(
                               true,  // do_verification
                               1,     // init_method: integer value
                               false, // do_log
                               false, // time_kernel
                               param);
        }
        EXPECT_TRUE(pass);
    }
};

using namespace ck::tensor_layout::convolution;

using KernelTypes2d = ::testing::Types<std::tuple<float, NHWGC, GKYXC, NHWGK>,
                                       std::tuple<ck::half_t, NHWGC, GKYXC, NHWGK>,
                                       std::tuple<ck::bhalf_t, NHWGC, GKYXC, NHWGK>>;
using KernelTypes3d = ::testing::Types<std::tuple<float, NDHWGC, GKZYXC, NDHWGK>,
                                       std::tuple<ck::half_t, NDHWGC, GKZYXC, NDHWGK>,
                                       std::tuple<ck::bhalf_t, NDHWGC, GKZYXC, NDHWGK>>;

template <typename Tuple>
class TestGroupedConvndFwd2d : public TestGroupedConvndFwd<Tuple>
{
};

template <typename Tuple>
class TestGroupedConvndFwd3d : public TestGroupedConvndFwd<Tuple>
{
};

TYPED_TEST_SUITE(TestGroupedConvndFwd2d, KernelTypes2d);
TYPED_TEST_SUITE(TestGroupedConvndFwd3d, KernelTypes3d);

TYPED_TEST(TestGroupedConvndFwd2d, Test2D)
{
    // Case larger than 2GB
    this->conv_params.push_back(
        {2, 1, 128, 4, 192, {2, 2}, {224, 224}, {224, 224}, {1, 1}, {0, 0}, {0, 0}});
    // With supported NumGroupsToMerge > 1
    this->conv_params.push_back(
        {2, 32, 64, 1, 1, {2, 2}, {672, 672}, {672, 672}, {1, 1}, {0, 0}, {0, 0}});
    // When image is larger than 2GB
    this->conv_params.push_back(
        {2, 2, 2, 128, 128, {3, 3}, {4096, 2048}, {300, 300}, {3, 3}, {1, 1}, {1, 1}});
    this->template Run<2>();
}

TYPED_TEST(TestGroupedConvndFwd3d, Test3D)
{
    // Case larger than 2GB
    this->conv_params.push_back(
        {3, 1, 128, 4, 192, {2, 2, 2}, {2, 224, 224}, {1, 224, 224}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
    // With supported NumGroupsToMerge > 1
    this->conv_params.push_back(
        {3, 32, 64, 1, 1, {2, 2, 2}, {360, 2, 672}, {360, 2, 672}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
    // When image is larger than 2GB
    this->conv_params.push_back(
        {3, 1, 2, 128, 128, {3, 1, 3}, {900, 2, 2048}, {300, 1, 300}, {3, 2, 3}, {1, 1, 1}, {1, 1, 1}});
    this->template Run<3>();
}
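A rough size check (not part of the commit) for the first 2-D "larger than 2GB" case above, assuming ConvParam's leading arguments are (ndim, G, N, K, C) followed by filter and input spatial lengths: the input tensor alone needs more bytes than a signed 32-bit offset can address even in fp16, which is why this test fixes IndexType to ck::long_index_t while the regular test keeps ck::index_t.

    # Input tensor of the first Test2D case: G=1, N=128, C=192, Hi=Wi=224.
    G, N, C = 1, 128, 192
    Hi = Wi = 224
    elements   = G * N * C * Hi * Wi    # 1,233,125,376 elements
    bytes_fp16 = elements * 2           # 2,466,250,752 bytes
    print(bytes_fp16 > 2**31 - 1)       # True: past the 2 GB / int32 offset limit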