"vscode:/vscode.git/clone" did not exist on "3126507ab99138d7ea8568381169f6244e64fedf"
Commit f6ceef78 authored by ThomasNing's avatar ThomasNing
Browse files

merge with the develop branch

parents 536c5458 25935b57
@@ -15,9 +15,27 @@ python3 process_perf_data.py perf_resnet50_N256.log
python3 process_perf_data.py perf_resnet50_N4.log
python3 process_perf_data.py perf_batched_gemm.log
python3 process_perf_data.py perf_grouped_gemm.log
python3 process_perf_data.py perf_grouped_conv_fwd.log
python3 process_perf_data.py perf_grouped_conv_bwd_data.log
python3 process_perf_data.py perf_grouped_conv_bwd_weight.log
python3 process_perf_data.py perf_gemm_bilinear.log
python3 process_perf_data.py perf_reduction.log
python3 process_perf_data.py perf_splitK_gemm.log
python3 process_perf_data.py perf_onnx_gemm.log
file=./perf_fmha_fwd_gfx942.log
if [ -e "$file" ]; then
python3 process_perf_data.py perf_fmha_fwd_gfx942.log
fi
file=./perf_fmha_bwd_gfx942.log
if [ -e "$file" ]; then
python3 process_perf_data.py perf_fmha_bwd_gfx942.log
fi
file=./perf_fmha_fwd_gfx90a.log
if [ -e "$file" ]; then
python3 process_perf_data.py perf_fmha_fwd_gfx90a.log
fi
file=./perf_fmha_bwd_gfx90a.log
if [ -e "$file" ]; then
python3 process_perf_data.py perf_fmha_bwd_gfx90a.log
fi
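The four fmha blocks repeat the same existence check with only the log name changing; a minimal loop-based equivalent (an editorial sketch, not part of the commit) would be:

for f in perf_fmha_fwd_gfx942.log perf_fmha_bwd_gfx942.log perf_fmha_fwd_gfx90a.log perf_fmha_bwd_gfx90a.log; do
    # hypothetical consolidation; same behavior as the four blocks above
    if [ -e "./$f" ]; then
        python3 process_perf_data.py "$f"
    fi
done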
@@ -12,27 +12,28 @@ INIT=$5
LOG=$6
TIME=$7
N=$8
SplitK=$9
# Resnet50
######## op datatype layout verify init log time conv_dim G__ N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 1024 1 1 14 14 1 1 1 1 0 0 0 0 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 128 3 3 56 56 2 2 1 1 1 1 1 1 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 2048 1 1 7 7 1 1 1 1 0 0 0 0 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 256 3 3 28 28 2 2 1 1 1 1 1 1 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 256 1 1 56 56 1 1 1 1 0 0 0 0 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 256 1 1 56 56 1 1 1 1 0 0 0 0 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 512 3 3 14 14 2 2 1 1 1 1 1 1 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 512 1 1 28 28 1 1 1 1 0 0 0 0 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 512 3 3 7 7 1 1 1 1 1 1 1 1 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 64 1 1 56 56 1 1 1 1 0 0 0 0 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 3 7 7 224 224 2 2 1 1 3 3 3 3 $SplitK
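With the new SplitK=$9 parameter, every Resnet50 shape now forwards a split-K factor. For reference, the nine positional arguments map as in the CI invocations later in this commit, e.g.

./profile_grouped_conv_bwd_weight.sh grouped_conv_bwd_weight 1 2 $verify 1 0 1 256 4

reads OP=grouped_conv_bwd_weight, DATATYPE=1, LAYOUT=2, VERIFY=$verify, INIT=1, LOG=0, TIME=1, N=256, SplitK=4.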
#!/bin/bash
## GPU visibility
export HIP_VISIBLE_DEVICES=0
DRIVER="../build/bin/ckProfiler"
OP=$1
DATATYPE=$2
LAYOUT=$3
INDEXTYPE=$4
VERIFY=$5
INIT=$6
LOG=$7
TIME=$8
N=$9
# Resnet50
######## op datatype indextype layout verify init log time conv_dim G__ N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 1024 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 128 3 3 56 56 2 2 1 1 1 1 1 1
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 2048 1 1 7 7 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 256 3 3 28 28 2 2 1 1 1 1 1 1
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 256 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 256 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 512 3 3 14 14 2 2 1 1 1 1 1 1
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 512 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 512 3 3 7 7 1 1 1 1 1 1 1 1
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 64 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 3 7 7 224 224 2 2 1 1 3 3 3 3
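This variant inserts INDEXTYPE as $4 before LAYOUT, so an invocation such as

./profile_grouped_conv_fwd.sh grouped_conv_fwd 0 1 0 $verify 1 0 1 256

(as used in the CI script below) reads OP=grouped_conv_fwd, DATATYPE=0, LAYOUT=1, INDEXTYPE=0, VERIFY=$verify, INIT=1, LOG=0, TIME=1, N=256.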
@@ -90,21 +90,27 @@ print_log_header $gemm_bilinear_log $env_type $branch $host_name
./profile_gemm_bilinear.sh gemm_bilinear 1 2 $verify 1 0 1 2>&1 | tee -a $gemm_bilinear_log
./profile_gemm_bilinear.sh gemm_bilinear 1 3 $verify 1 0 1 2>&1 | tee -a $gemm_bilinear_log

#run grouped_fwd tests
export grouped_conv_fwd_log="perf_grouped_conv_fwd.log"
print_log_header $grouped_conv_fwd_log $env_type $branch $host_name
./profile_grouped_conv_fwd.sh grouped_conv_fwd 0 1 0 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_fwd_log
./profile_grouped_conv_fwd.sh grouped_conv_fwd 1 1 0 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_fwd_log
./profile_grouped_conv_fwd.sh grouped_conv_fwd 2 1 0 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_fwd_log

#run grouped_bwd_data tests
export grouped_conv_bwd_data_log="perf_grouped_conv_bwd_data.log"
print_log_header $grouped_conv_bwd_data_log $env_type $branch $host_name
./profile_grouped_conv_bwd_data.sh grouped_conv_bwd_data 0 1 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_bwd_data_log
./profile_grouped_conv_bwd_data.sh grouped_conv_bwd_data 1 1 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_bwd_data_log
./profile_grouped_conv_bwd_data.sh grouped_conv_bwd_data 2 1 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_bwd_data_log
#run grouped_bwd_weight tests
export grouped_conv_bwd_weight_log="perf_grouped_conv_bwd_weight.log"
print_log_header $grouped_conv_bwd_weight_log $env_type $branch $host_name
./profile_grouped_conv_bwd_weight.sh grouped_conv_bwd_weight 0 2 $verify 1 0 1 256 1 2>&1 | tee -a $grouped_conv_bwd_weight_log
./profile_grouped_conv_bwd_weight.sh grouped_conv_bwd_weight 1 2 $verify 1 0 1 256 1 2>&1 | tee -a $grouped_conv_bwd_weight_log
./profile_grouped_conv_bwd_weight.sh grouped_conv_bwd_weight 2 2 $verify 1 0 1 256 1 2>&1 | tee -a $grouped_conv_bwd_weight_log
./profile_grouped_conv_bwd_weight.sh grouped_conv_bwd_weight 1 2 $verify 1 0 1 256 4 2>&1 | tee -a $grouped_conv_bwd_weight_log
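Note that the trailing 1 (and 4 in the last run) feeds the new SplitK=$9 parameter introduced above, so bwd-weight performance is collected for split-K factors 1 and 4.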
#run resnet50 tests
export resnet256_log="perf_resnet50_N256.log"
...
@@ -51,6 +51,21 @@ print_log_header $gemm_log $env_type $branch $host_name
./profile_gemm.sh gemm 2 3 $verify 1 0 1 | tee -a $gemm_log
./profile_gemm.sh gemm 3 3 $verify 1 0 1 | tee -a $gemm_log
#run grouped_fwd fp16 tests
export grouped_conv_fwd_log="perf_grouped_conv_fwd_fp16.log"
print_log_header $grouped_conv_fwd_log $env_type $branch $host_name
./profile_grouped_conv_fwd.sh grouped_conv_fwd 1 1 0 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_fwd_log
#run grouped_bwd_data fp16 tests
export grouped_conv_bwd_data_log="perf_grouped_conv_bwd_data_fp16.log"
print_log_header $grouped_conv_bwd_data_log $env_type $branch $host_name
./profile_grouped_conv_bwd_data.sh grouped_conv_bwd_data 1 1 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_bwd_data_log
#run grouped_bwd_weight fp16 tests
export grouped_conv_bwd_weight_log="perf_grouped_conv_bwd_weight_fp16.log"
print_log_header $grouped_conv_bwd_weight_log $env_type $branch $host_name
./profile_grouped_conv_bwd_weight.sh grouped_conv_bwd_weight 1 1 $verify 1 0 1 256 1 2>&1 | tee -a $grouped_conv_bwd_weight_log
#run resnet50 tests
export resnet256_log="perf_resnet50_N256.log"
print_log_header $resnet256_log $env_type $branch $host_name
...
#!/bin/bash
## The following will be used for CI
set -x
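## usage (inferred from the sections below): test_reduce_with_index -D <input lengths> -R <dims to reduce> <data type> <init method>
## data type codes per the section labels: 0=float, 1=float16, 3=int8_t, 5=bfloat16, 6=float64; init method 2 = scoped integer init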
## for float
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 2 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 3 0 2
## for float64
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 2 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 3 6 2
## for float16
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 2 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 3 1 2
## for int8_t
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 2 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 3 3 2
## for bfloat16
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 2 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 3 5 2
set +x
@@ -68,11 +68,11 @@ function(add_test_executable TEST_NAME)
    #only continue if there are some source files left on the list
    if(ARGN)
        if(ARGN MATCHES "_xdl")
            list(REMOVE_ITEM TEST_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
        elseif(ARGN MATCHES "_wmma")
            list(REMOVE_ITEM TEST_TARGETS gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
        elseif(ARGN MATCHES "_smfmac")
            list(REMOVE_ITEM TEST_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a gfx1200 gfx1201)
        endif()
        set_source_files_properties(${ARGN} PROPERTIES LANGUAGE HIP)
        add_executable(${TEST_NAME} ${ARGN})
@@ -149,11 +149,11 @@ function(add_gtest_executable TEST_NAME)
    #only continue if there are some source files left on the list
    if(ARGN)
        if(ARGN MATCHES "_xdl")
            list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
        elseif(ARGN MATCHES "_wmma")
            list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
        elseif(ARGN MATCHES "_smfmac")
            list(REMOVE_ITEM TEST_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a gfx1200 gfx1201)
        endif()
        set_source_files_properties(${ARGN} PROPERTIES LANGUAGE HIP)
        add_executable(${TEST_NAME} ${ARGN})
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#include <iostream>
#include <string>
@@ -24,12 +24,12 @@ class TestConvUtil : public ::testing::Test
                       128,
                       192,
                       256,
                       std::vector<ck::long_index_t>(ndims, 3),
                       std::vector<ck::long_index_t>(ndims, 71),
                       std::vector<ck::long_index_t>(ndims, s),
                       std::vector<ck::long_index_t>(ndims, d),
                       std::vector<ck::long_index_t>(ndims, p),
                       std::vector<ck::long_index_t>(ndims, p));
    }

    protected:
@@ -48,35 +48,35 @@ TEST_F(TestConvUtil, ConvParamsGetOutputSpatialLengths1D)
{
    // stride 2, dilation 1, pad 1
    SetNDParams(1, 2, 1, 1);
    std::vector<ck::long_index_t> out_spatial_len = conv_params.GetOutputSpatialLengths();
    EXPECT_TRUE(ck::utils::check_err(
        out_spatial_len, std::vector<ck::long_index_t>{36}, "Error: ConvParams 1D."));

    // stride 1, dilation 1, pad 1
    SetNDParams(1, 1, 1, 1);
    out_spatial_len = conv_params.GetOutputSpatialLengths();
    EXPECT_TRUE(ck::utils::check_err(
        out_spatial_len, std::vector<ck::long_index_t>{71}, "Error: ConvParams 1D stride {1}."));

    // stride 2, dilation 1, pad 2
    SetNDParams(1, 2, 1, 2);
    out_spatial_len = conv_params.GetOutputSpatialLengths();
    EXPECT_TRUE(ck::utils::check_err(out_spatial_len,
                                     std::vector<ck::long_index_t>{37},
                                     "Error: ConvParams 1D padding left/right {2}."));

    // stride 2, dilation 2, pad 2
    SetNDParams(1, 2, 2, 2);
    out_spatial_len = conv_params.GetOutputSpatialLengths();
    EXPECT_TRUE(ck::utils::check_err(
        out_spatial_len, std::vector<ck::long_index_t>{36}, "Error: ConvParams 1D dilation {2}."));

    // stride 3, dilation 2, pad 1
    SetNDParams(1, 3, 2, 1);
    out_spatial_len = conv_params.GetOutputSpatialLengths();
    EXPECT_TRUE(
        ck::utils::check_err(out_spatial_len,
                             std::vector<ck::long_index_t>{23},
                             "Error: ConvParams 1D strides{3}, padding {1}, dilations {2}."));
}
@@ -84,36 +84,38 @@ TEST_F(TestConvUtil, ConvParamsGetOutputSpatialLengths2D)
{
    // stride 2, dilation 1, pad 1
    SetNDParams(2, 2, 1, 1);
    std::vector<ck::long_index_t> out_spatial_len = conv_params.GetOutputSpatialLengths();
    EXPECT_TRUE(ck::utils::check_err(out_spatial_len,
                                     std::vector<ck::long_index_t>{36, 36},
                                     "Error: ConvParams 2D default constructor."));

    // stride 1, dilation 1, pad 1
    SetNDParams(2, 1, 1, 1);
    out_spatial_len = conv_params.GetOutputSpatialLengths();
    EXPECT_TRUE(ck::utils::check_err(out_spatial_len,
                                     std::vector<ck::long_index_t>{71, 71},
                                     "Error: ConvParams 2D stride {1,1}."));

    // stride 2, dilation 1, pad 2
    SetNDParams(2, 2, 1, 2);
    out_spatial_len = conv_params.GetOutputSpatialLengths();
    EXPECT_TRUE(ck::utils::check_err(out_spatial_len,
                                     std::vector<ck::long_index_t>{37, 37},
                                     "Error: ConvParams 2D padding left/right {2,2}."));

    // stride 2, dilation 2, pad 2
    SetNDParams(2, 2, 2, 2);
    out_spatial_len = conv_params.GetOutputSpatialLengths();
    EXPECT_TRUE(ck::utils::check_err(out_spatial_len,
                                     std::vector<ck::long_index_t>{36, 36},
                                     "Error: ConvParams 2D dilation {2,2}."));

    // stride 3, dilation 2, pad 1
    SetNDParams(2, 3, 2, 1);
    out_spatial_len = conv_params.GetOutputSpatialLengths();
    EXPECT_TRUE(
        ck::utils::check_err(out_spatial_len,
                             std::vector<ck::long_index_t>{23, 23},
                             "Error: ConvParams 2D strides{3,3}, padding {1,1}, dilations {2,2}."));
}
@@ -121,29 +123,29 @@ TEST_F(TestConvUtil, ConvParamsGetOutputSpatialLengths3D)
{
    // stride 2, dilation 1, pad 1
    SetNDParams(3, 2, 1, 1);
    std::vector<ck::long_index_t> out_spatial_len = conv_params.GetOutputSpatialLengths();
    EXPECT_TRUE(ck::utils::check_err(
        out_spatial_len, std::vector<ck::long_index_t>{36, 36, 36}, "Error: ConvParams 3D."));

    // stride 1, dilation 1, pad 1
    SetNDParams(3, 1, 1, 1);
    out_spatial_len = conv_params.GetOutputSpatialLengths();
    EXPECT_TRUE(ck::utils::check_err(out_spatial_len,
                                     std::vector<ck::long_index_t>{71, 71, 71},
                                     "Error: ConvParams 3D stride {1, 1, 1}."));

    // stride 2, dilation 1, pad 2
    SetNDParams(3, 2, 1, 2);
    out_spatial_len = conv_params.GetOutputSpatialLengths();
    EXPECT_TRUE(ck::utils::check_err(out_spatial_len,
                                     std::vector<ck::long_index_t>{37, 37, 37},
                                     "Error: ConvParams 3D padding left/right {2, 2, 2}."));

    // stride 2, dilation 2, pad 2
    SetNDParams(3, 2, 2, 2);
    out_spatial_len = conv_params.GetOutputSpatialLengths();
    EXPECT_TRUE(ck::utils::check_err(out_spatial_len,
                                     std::vector<ck::long_index_t>{36, 36, 36},
                                     "Error: ConvParams 3D dilation {2, 2, 2}."));

    // stride 3, dilation 2, pad 1
@@ -151,6 +153,6 @@ TEST_F(TestConvUtil, ConvParamsGetOutputSpatialLengths3D)
    out_spatial_len = conv_params.GetOutputSpatialLengths();
    EXPECT_TRUE(ck::utils::check_err(
        out_spatial_len,
        std::vector<ck::long_index_t>{23, 23, 23},
        "Error: ConvParams 3D strides{3, 3, 3}, padding {1, 1, 1}, dilations {2, 2, 2}."));
}
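For reference, every expected length above follows the standard convolution output-size formula (stated here for clarity; the tests themselves only pin the numbers):

L_out = floor((L_in + pad_left + pad_right - dilation * (K - 1) - 1) / stride) + 1

e.g. for the 1D case with input 71, filter 3, stride 2, dilation 1, pads 1: floor((71 + 2 - 2 - 1) / 2) + 1 = 36.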
if (GPU_TARGETS)
if (GPU_TARGETS MATCHES "gfx10" OR GPU_TARGETS MATCHES "gfx11" OR GPU_TARGETS MATCHES "gfx12")
add_definitions(-DCK_SKIP_FLAKY_F8_TEST)
set(CK_SKIP_FLAKY_F8_TEST "ON")
endif()
else()
add_definitions(-DCK_SKIP_FLAKY_F8_TEST)
set(CK_SKIP_FLAKY_F8_TEST "ON")
endif()
if (USE_BITINT_EXTENSION_INT4)
    add_gtest_executable(test_int4 test_int4.cpp)
    if(result EQUAL 0)
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#include "gtest/gtest.h"

#include "ck/utility/data_type.hpp"
#include "ck/utility/type_convert.hpp"

using ck::bf8_t;
using ck::f8_convert_rne;
using ck::f8_convert_sr;
using ck::half_t;
using ck::type_convert;
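// Note: f8_convert_rne rounds to nearest-even, f8_convert_sr rounds stochastically;
// switching the round-trip tests below to f8_convert_rne pins the deterministic rounding path.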
@@ -24,33 +24,36 @@ TEST(BF8, ConvertFP32Nearest)
    // fix the tolerance value
    float abs_tol = 1e-6;
    // convert 0 float to bf8 and back, check if holds
    ASSERT_NEAR(0.0f, type_convert<float>(f8_convert_rne<bf8_t>(0.0f)), abs_tol);
// skip the next check where CK_SKIP_FLAKY_F8_TEST is defined (gfx10/11/12 or unspecified targets)
#ifndef CK_SKIP_FLAKY_F8_TEST
    // convert minimal float to bf8 and back, check if holds
    ASSERT_NEAR(std::numeric_limits<float>::min(),
                type_convert<float>(f8_convert_rne<bf8_t>(std::numeric_limits<float>::min())),
                abs_tol);
#endif
    // convert maximal bf8_t to float and check if equal to 57344.0
    ASSERT_NEAR(57344.0f, type_convert<float>(f8_convert_rne<bf8_t>(57344.0f)), abs_tol);
    // convert maximal float to bf8 and back, check if clipped to 57344.0
    ASSERT_NEAR(57344.0f,
                type_convert<float>(f8_convert_rne<bf8_t>(std::numeric_limits<float>::max())),
                abs_tol);
    // convert inf float to bf8_t and check if it is qNan
    ASSERT_NEAR(type_convert<bf8_t>(0x80),
                f8_convert_rne<bf8_t>(std::numeric_limits<float>::infinity()),
                abs_tol);
    // positive norm float value to bf8 and back, check if holds
    float pos_float = 0.0000762939f;
    ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_rne<bf8_t>(pos_float)), abs_tol);
    // negative norm float value to bf8 and back, check if holds
    float neg_float = -0.0000610351f;
    ASSERT_NEAR(neg_float, type_convert<float>(f8_convert_rne<bf8_t>(neg_float)), abs_tol);
    // positive subnorm float value to bf8 and back, check if holds
    pos_float = 0.0000305175f;
    ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_rne<bf8_t>(pos_float)), abs_tol);
    // negative subnorm float value to bf8 and back, check if holds
    neg_float = -0.0000152587f;
    ASSERT_NEAR(neg_float, type_convert<float>(f8_convert_rne<bf8_t>(neg_float)), abs_tol);
}
TEST(BF8, ConvertFP32Stochastic)
@@ -92,34 +96,34 @@ TEST(BF8, ConvertFP16Nearest)
    // fix the tolerance value
    float abs_tol = 1e-3;
    // convert 0 fp16 to bf8 and back, check if holds
    ASSERT_NEAR(half_t{0.0}, type_convert<half_t>(f8_convert_rne<bf8_t>(half_t{0.0})), abs_tol);
    // convert minimal fp16 to bf8 and back, check if holds
    ASSERT_NEAR(ck::NumericLimits<half_t>::Min(),
                type_convert<half_t>(f8_convert_rne<bf8_t>(ck::NumericLimits<half_t>::Min())),
                abs_tol);
    // convert maximal bf8_t to fp16 and check if equal to 57344.0
    ASSERT_NEAR(
        half_t{57344.0}, type_convert<half_t>(f8_convert_rne<bf8_t>(half_t{57344.0})), abs_tol);
    // convert maximal fp16 to bf8 and back, check if clipped to 57344.0
    ASSERT_NEAR(half_t{57344.0},
                type_convert<half_t>(f8_convert_rne<bf8_t>(ck::NumericLimits<half_t>::Max())),
                abs_tol);
    // convert QuietNaN fp16 to bf8_t and check if it is QuietNaN
    ASSERT_NEAR(type_convert<bf8_t>(0x80),
                f8_convert_rne<bf8_t>(ck::NumericLimits<half_t>::QuietNaN()),
                abs_tol);
    // positive norm fp16 value to bf8 and back, check if holds
    half_t pos_half = half_t{0.0000762939};
    ASSERT_NEAR(pos_half, type_convert<half_t>(f8_convert_rne<bf8_t>(pos_half)), abs_tol);
    // negative norm fp16 value to bf8 and back, check if holds
    half_t neg_half = half_t{-0.0000610351};
    ASSERT_NEAR(neg_half, type_convert<half_t>(f8_convert_rne<bf8_t>(neg_half)), abs_tol);
    // positive subnorm fp16 value to bf8 and back, check if holds
    pos_half = half_t{0.0000305175};
    ASSERT_NEAR(pos_half, type_convert<half_t>(f8_convert_rne<bf8_t>(pos_half)), abs_tol);
    // negative subnorm fp16 value to bf8 and back, check if holds
    neg_half = half_t{-0.0000152587};
    ASSERT_NEAR(neg_half, type_convert<half_t>(f8_convert_rne<bf8_t>(neg_half)), abs_tol);
}

TEST(BF8, ConvertFP16Stochastic)
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#include "gtest/gtest.h"

#include "ck/utility/data_type.hpp"
#include "ck/utility/type_convert.hpp"

using ck::f8_convert_rne;
using ck::f8_convert_sr;
using ck::f8_t;
using ck::half_t;
@@ -24,33 +24,36 @@ TEST(FP8, ConvertFP32Nearest)
    // fix the tolerance value
    float abs_tol = 1e-6;
    // convert 0 float to fp8 and back, check if holds
    ASSERT_NEAR(0.0f, type_convert<float>(f8_convert_rne<f8_t>(0.0f)), abs_tol);
// skip the next check where CK_SKIP_FLAKY_F8_TEST is defined (gfx10/11/12 or unspecified targets)
#ifndef CK_SKIP_FLAKY_F8_TEST
    // convert minimal float to fp8 and back, check if holds
    ASSERT_NEAR(std::numeric_limits<float>::min(),
                type_convert<float>(f8_convert_rne<f8_t>(std::numeric_limits<float>::min())),
                abs_tol);
#endif
    // convert maximal f8_t to float and check if equal to 240.0
    ASSERT_NEAR(240.0f, type_convert<float>(f8_convert_rne<f8_t>(240.0f)), abs_tol);
    // convert maximal float to fp8 and back, check if clipped to 240.0
    ASSERT_NEAR(240.0f,
                type_convert<float>(f8_convert_rne<f8_t>(std::numeric_limits<float>::max())),
                abs_tol);
    // convert inf float to f8_t and check if it is qNan
    ASSERT_NEAR(type_convert<f8_t>(0x80),
                f8_convert_rne<f8_t>(std::numeric_limits<float>::infinity()),
                abs_tol);
    // positive norm float value to fp8 and back, check if holds
    float pos_float = 0.017578125f;
    ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_rne<f8_t>(pos_float)), abs_tol);
    // negative norm float value to fp8 and back, check if holds
    float neg_float = -0.015625f;
    ASSERT_NEAR(neg_float, type_convert<float>(f8_convert_rne<f8_t>(neg_float)), abs_tol);
    // positive subnorm float value to fp8 and back, check if holds
    pos_float = 0.00390625f;
    ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_rne<f8_t>(pos_float)), abs_tol);
    // negative subnorm float value to fp8 and back, check if holds
    neg_float = -0.001953125f;
    ASSERT_NEAR(neg_float, type_convert<float>(f8_convert_rne<f8_t>(neg_float)), abs_tol);
}
TEST(FP8, ConvertFP32Stochastic)
@@ -92,33 +96,33 @@ TEST(FP8, ConvertFP16Nearest)
    // fix the tolerance value
    float abs_tol = 1e-3;
    // convert 0 fp16 to fp8 and back, check if holds
    ASSERT_NEAR(half_t{0.0}, type_convert<half_t>(f8_convert_rne<f8_t>(half_t{0.0})), abs_tol);
    // convert minimal fp16 to fp8 and back, check if holds
    ASSERT_NEAR(ck::NumericLimits<half_t>::Min(),
                type_convert<half_t>(f8_convert_rne<f8_t>(ck::NumericLimits<half_t>::Min())),
                abs_tol);
    // convert maximal f8_t to fp16 and check if equal to 240.0
    ASSERT_NEAR(half_t{240.0}, type_convert<half_t>(f8_convert_rne<f8_t>(half_t{240.0})), abs_tol);
    // convert maximal fp16 to fp8 and back, check if clipped to 240.0
    ASSERT_NEAR(half_t{240.0},
                type_convert<half_t>(f8_convert_rne<f8_t>(ck::NumericLimits<half_t>::Max())),
                abs_tol);
    // convert QuietNaN fp16 to f8_t and check if it is QuietNaN
    ASSERT_NEAR(type_convert<f8_t>(0x80),
                f8_convert_rne<f8_t>(ck::NumericLimits<half_t>::QuietNaN()),
                abs_tol);
    // positive norm fp16 value to fp8 and back, check if holds
    half_t pos_half = half_t{0.017578125};
    ASSERT_NEAR(pos_half, type_convert<half_t>(f8_convert_rne<f8_t>(pos_half)), abs_tol);
    // negative norm fp16 value to fp8 and back, check if holds
    half_t neg_half = half_t{-0.015625};
    ASSERT_NEAR(neg_half, type_convert<half_t>(f8_convert_rne<f8_t>(neg_half)), abs_tol);
    // positive subnorm fp16 value to fp8 and back, check if holds
    pos_half = half_t{0.00390625};
    ASSERT_NEAR(pos_half, type_convert<half_t>(f8_convert_rne<f8_t>(pos_half)), abs_tol);
    // negative subnorm fp16 value to fp8 and back, check if holds
    neg_half = half_t{-0.001953125};
    ASSERT_NEAR(neg_half, type_convert<half_t>(f8_convert_rne<f8_t>(neg_half)), abs_tol);
}

TEST(FP8, ConvertFP16Stochastic)
...
@@ -44,17 +44,22 @@ class TestGemmUniversal_MK_NK
using KernelTypes_MK_KN = ::testing::Types<
    // ADataType, BDataType, ComputeDataType, CDataType
    std::tuple< F16, F16, F16, F16>,
#if (defined CK_ENABLE_FP8)
    std::tuple< F16, F8, F16, F16>,
    std::tuple< F8, F16, F16, F16>,
    std::tuple< F8, F8, F8, BF16>,
#endif
    std::tuple< BF16, BF16, BF16, BF16>
    >;

using KernelTypes_MK_NK = ::testing::Types<
    // ADataType, BDataType, ComputeDataType, CDataType
    std::tuple< F16, F16, F16, F16>,
#if (defined CK_ENABLE_FP8)
    std::tuple< F16, F8, F16, F16>,
    std::tuple< F8, F16, F16, F16>,
    std::tuple< F8, F8, F8, BF16>,
#endif
    std::tuple< BF16, BF16, BF16, BF16>
    >;
// clang-format on
...
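Guarding the F8 tuples with CK_ENABLE_FP8 keeps both typed suites compiling when the library is built without FP8 instances; moving the BF16 tuple to the end means the list terminates without a trailing comma whether or not the guard is active.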
@@ -7,6 +7,12 @@ if(GPU_TARGETS MATCHES "gfx9" OR GPU_TARGETS MATCHES "gfx11")
    endif()
endif()
if(GPU_TARGETS MATCHES "gfx9")
add_executable(test_grouped_convnd_fwd_large_cases_xdl test_grouped_convnd_fwd_large_cases_xdl.cpp)
target_compile_options(test_grouped_convnd_fwd_large_cases_xdl PRIVATE -Wno-global-constructors -Wno-undef)
target_link_libraries(test_grouped_convnd_fwd_large_cases_xdl PRIVATE gtest_main getopt::getopt utility device_grouped_conv1d_fwd_instance device_grouped_conv2d_fwd_instance device_grouped_conv3d_fwd_instance)
endif()
add_gtest_executable(test_grouped_convnd_fwd_multi_ab_interface test_grouped_convnd_fwd_multi_ab_interface.cpp)
if(result EQUAL 0)
    target_link_libraries(test_grouped_convnd_fwd_multi_ab_interface PRIVATE utility)
...
@@ -17,6 +17,7 @@ class TestGroupedConvndFwd : public ::testing::Test
    using InLayout  = std::tuple_element_t<1, Tuple>;
    using WeiLayout = std::tuple_element_t<2, Tuple>;
    using OutLayout = std::tuple_element_t<3, Tuple>;
    using IndexType = ck::index_t;

    std::vector<ck::utils::conv::ConvParam> conv_params;
@@ -33,7 +34,10 @@ class TestGroupedConvndFwd : public ::testing::Test
                             OutLayout,
                             DataType,
                             DataType,
                             DataType,
                             DataType,
                             DataType,
                             IndexType>(
            true,  // do_verification
            1,     // init_method: integer value
            false, // do_log
@@ -69,8 +73,6 @@ using KernelTypes3d = ::testing::Types<std::tuple<float, GNDHWC, GKZYXC, GNDHWK>,
                                       std::tuple<ck::bhalf_t, NDHWGC, GKZYXC, NDHWGK>,
                                       std::tuple<int8_t, NDHWGC, GKZYXC, NDHWGK>>;

template <typename Tuple>
class TestGroupedConvndFwd1d : public TestGroupedConvndFwd<Tuple>
{
@@ -86,15 +88,9 @@ class TestGroupedConvndFwd3d : public TestGroupedConvndFwd<Tuple>
{
};

TYPED_TEST_SUITE(TestGroupedConvndFwd1d, KernelTypes1d);
TYPED_TEST_SUITE(TestGroupedConvndFwd2d, KernelTypes2d);
TYPED_TEST_SUITE(TestGroupedConvndFwd3d, KernelTypes3d);

TYPED_TEST(TestGroupedConvndFwd1d, Test1D)
{
@@ -144,14 +140,3 @@ TYPED_TEST(TestGroupedConvndFwd3d, Test3D)
        {3, 96, 1, 1, 1, {3, 3, 3}, {4, 30, 160}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
    this->template Run<3>();
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <vector>
#include <gtest/gtest.h>
#include "profiler/profile_grouped_conv_fwd_impl.hpp"
template <typename Tuple>
class TestGroupedConvndFwd : public ::testing::Test
{
protected:
using DataType = std::tuple_element_t<0, Tuple>;
using InLayout = std::tuple_element_t<1, Tuple>;
using WeiLayout = std::tuple_element_t<2, Tuple>;
using OutLayout = std::tuple_element_t<3, Tuple>;
using IndexType = ck::long_index_t;
std::vector<ck::utils::conv::ConvParam> conv_params;
template <ck::index_t NDimSpatial>
void Run()
{
EXPECT_FALSE(conv_params.empty());
bool pass = true;
for(auto& param : conv_params)
{
pass = pass && ck::profiler::profile_grouped_conv_fwd_impl<NDimSpatial,
InLayout,
WeiLayout,
OutLayout,
DataType,
DataType,
DataType,
DataType,
DataType,
IndexType>(
true, // do_verification
1, // init_method: integer value
false, // do_log
false, // time_kernel
param);
}
EXPECT_TRUE(pass);
}
};
using namespace ck::tensor_layout::convolution;
using KernelTypes2d = ::testing::Types<std::tuple<float, NHWGC, GKYXC, NHWGK>,
std::tuple<ck::half_t, NHWGC, GKYXC, NHWGK>,
std::tuple<ck::bhalf_t, NHWGC, GKYXC, NHWGK>>;
using KernelTypes3d = ::testing::Types<std::tuple<float, NDHWGC, GKZYXC, NDHWGK>,
std::tuple<ck::half_t, NDHWGC, GKZYXC, NDHWGK>,
std::tuple<ck::bhalf_t, NDHWGC, GKZYXC, NDHWGK>>;
template <typename Tuple>
class TestGroupedConvndFwd2d : public TestGroupedConvndFwd<Tuple>
{
};
template <typename Tuple>
class TestGroupedConvndFwd3d : public TestGroupedConvndFwd<Tuple>
{
};
TYPED_TEST_SUITE(TestGroupedConvndFwd2d, KernelTypes2d);
TYPED_TEST_SUITE(TestGroupedConvndFwd3d, KernelTypes3d);
TYPED_TEST(TestGroupedConvndFwd2d, Test2D)
{
// Case larger than 2GB
this->conv_params.push_back(
{2, 1, 128, 4, 192, {2, 2}, {224, 224}, {224, 224}, {1, 1}, {0, 0}, {0, 0}});
// With supported NumGroupsToMerge > 1
this->conv_params.push_back(
{2, 32, 64, 1, 1, {2, 2}, {672, 672}, {672, 672}, {1, 1}, {0, 0}, {0, 0}});
// When image is larger than 2GB
this->conv_params.push_back(
{2, 2, 2, 128, 128, {3, 3}, {4096, 2048}, {300, 300}, {3, 3}, {1, 1}, {1, 1}});
this->template Run<2>();
}
TYPED_TEST(TestGroupedConvndFwd3d, Test3D)
{
// Case larger than 2GB
this->conv_params.push_back({3,
1,
128,
4,
192,
{2, 2, 2},
{2, 224, 224},
{1, 224, 224},
{1, 1, 1},
{0, 0, 0},
{0, 0, 0}});
// With supported NumGroupsToMerge > 1
this->conv_params.push_back({3,
32,
64,
1,
1,
{2, 2, 2},
{360, 2, 672},
{360, 2, 672},
{1, 1, 1},
{0, 0, 0},
{0, 0, 0}});
// When image is larger than 2GB
this->conv_params.push_back({3,
1,
2,
128,
128,
{3, 1, 3},
{900, 2, 2048},
{300, 1, 300},
{3, 2, 3},
{1, 1, 1},
{1, 1, 1}});
this->template Run<3>();
}
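A quick check on the "larger than 2GB" labels: the third 2D case, for instance, holds 2 * 2 * 128 * 4096 * 2048 = 2^32 input elements, which no longer fits in a 32-bit ck::index_t; that is why this dedicated suite sets IndexType = ck::long_index_t while the regular suite above keeps ck::index_t.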
add_gtest_executable(test_reduce_no_index reduce_no_index.cpp)
add_gtest_executable(test_reduce_with_index reduce_with_index.cpp)
target_link_libraries(test_reduce_no_index PRIVATE utility device_reduce_instance)
target_link_libraries(test_reduce_with_index PRIVATE utility device_reduce_instance)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#include "ck/library/utility/host_common_util.hpp"
#include "profiler/profile_reduce_impl.hpp"
#include <gtest/gtest.h>
using namespace ck;
struct ReduceParam
{
    bool do_verification{true};
    bool propagateNan{false};
    bool useIndex{false};
    bool time_kernel{false};
    bool do_dumpout{false};
    int init_method{2};
    float alpha{1.0f};
    float beta{0.0f};
    std::vector<size_t> inLengths{64, 4, 280, 82};
    std::vector<int> reduceDims{0, 1, 2, 3};
};

std::vector<std::vector<int>> SetGenericReduceDim()
{
    return {{0, 1, 2, 3}, {0, 1, 2}, {0, 1, 3}, {0, 2, 3}, {1, 2, 3}, {0}, {1}, {2}, {3}};
}

template <typename T>
class ReduceWithIndexTest : public ::testing::Test
{
    protected:
    using InDataType  = std::tuple_element_t<0, T>;
    using AccDataType = std::tuple_element_t<1, T>;
    using OutDataType = std::tuple_element_t<2, T>;

    static std::vector<ReduceParam> params;

    static void SetUpTestSuite()
    {
        // set testcase variables
        ReduceParam set;
        const auto setReduceDim = SetGenericReduceDim();

        for(std::size_t i(0); i < setReduceDim.size(); ++i)
        {
            set.reduceDims = setReduceDim[i];
            params.emplace_back(set);
        }
    }

    template <ReduceTensorOp ReduceOpIdType>
    void Run()
    {
        for(auto param : this->params)
        {
            bool success = ck::profiler::profile_reduce_impl<InDataType, AccDataType, OutDataType>(
                param.do_verification,
                param.init_method,
                param.do_dumpout,
                param.time_kernel,
                param.inLengths,
                param.reduceDims,
                ReduceOpIdType,
                param.propagateNan,
                param.useIndex,
                param.alpha,
                param.beta);
            EXPECT_TRUE(success);
        }
    }
};

template <typename T>
std::vector<ReduceParam> ReduceWithIndexTest<T>::params = {};

using Reduce_float_types       = ::testing::Types<std::tuple<float, float, float>>;
using Reduce_double_types      = ::testing::Types<std::tuple<double, double, double>>;
using Reduce_int8t_types       = ::testing::Types<std::tuple<int8_t, int8_t, int8_t>>;
using Reduce_half_types        = ::testing::Types<std::tuple<ck::half_t, ck::half_t, ck::half_t>>;
using Reduce_bhalf_float_Types = ::testing::Types<std::tuple<ck::bhalf_t, float, ck::bhalf_t>>;

template <typename TType>
class ReduceWithNoIndexFloat : public ReduceWithIndexTest<TType>
{
};

template <typename TType>
class ReduceWithNoIndexDouble : public ReduceWithIndexTest<TType>
{
};

template <typename TType>
class ReduceWithNoIndexInt8 : public ReduceWithIndexTest<TType>
{
};

template <typename TType>
class ReduceWithNoIndexHalf : public ReduceWithIndexTest<TType>
{
};

template <typename TType>
class ReduceWithNoIndexBHalfFloat : public ReduceWithIndexTest<TType>
{
};

TYPED_TEST_SUITE(ReduceWithNoIndexFloat, Reduce_float_types);
TYPED_TEST_SUITE(ReduceWithNoIndexDouble, Reduce_double_types);
TYPED_TEST_SUITE(ReduceWithNoIndexInt8, Reduce_int8t_types);
TYPED_TEST_SUITE(ReduceWithNoIndexHalf, Reduce_half_types);
TYPED_TEST_SUITE(ReduceWithNoIndexBHalfFloat, Reduce_bhalf_float_Types);

TYPED_TEST(ReduceWithNoIndexFloat, ReduceWithNoIndexTestFloat_AMAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::AMAX>();
}

TYPED_TEST(ReduceWithNoIndexFloat, ReduceWithNoIndexTestFloat_MIN)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MIN>();
}

TYPED_TEST(ReduceWithNoIndexFloat, ReduceWithNoIndexTestFloat_MAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MAX>();
}

TYPED_TEST(ReduceWithNoIndexDouble, ReduceWithNoIndexTestDouble_AMAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::AMAX>();
}

TYPED_TEST(ReduceWithNoIndexDouble, ReduceWithNoIndexTestDouble_MIN)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MIN>();
}

TYPED_TEST(ReduceWithNoIndexDouble, ReduceWithNoIndexTestDouble_MAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MAX>();
}

TYPED_TEST(ReduceWithNoIndexInt8, ReduceWithNoIndexTestInt8_AMAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::AMAX>();
}

TYPED_TEST(ReduceWithNoIndexInt8, ReduceWithNoIndexTestInt8_MIN)
propagateNan, {
args.scales[0], // trigger Run() -> Generic
args.scales[1]); this->template Run<ReduceTensorOp::MIN>();
} }
std::cout << "test_reduce_no_index ..... " << (result ? "SUCCESS" : "FAILURE") << std::endl; TYPED_TEST(ReduceWithNoIndexInt8, ReduceWithNoIndexTestInt8_MAX)
{
// trigger Run() -> Generic
this->template Run<ReduceTensorOp::MAX>();
}
TYPED_TEST(ReduceWithNoIndexHalf, ReduceWithNoIndexTestHalf_AMAX)
{
// trigger Run() -> Generic
this->template Run<ReduceTensorOp::AMAX>();
}
TYPED_TEST(ReduceWithNoIndexHalf, ReduceWithNoIndexTestHalf_MIN)
{
// trigger Run() -> Generic
this->template Run<ReduceTensorOp::MIN>();
}
TYPED_TEST(ReduceWithNoIndexHalf, ReduceWithNoIndexTestHalf_MAX)
{
// trigger Run() -> Generic
this->template Run<ReduceTensorOp::MAX>();
}
TYPED_TEST(ReduceWithNoIndexBHalfFloat, ReduceWithNoIndexTesBtHalfFloat_AMAX)
{
// trigger Run() -> Generic
this->template Run<ReduceTensorOp::AMAX>();
}
return (result ? 0 : -1); TYPED_TEST(ReduceWithNoIndexBHalfFloat, ReduceWithNoIndexTestBHalfFloat_MIN)
{
// trigger Run() -> Generic
this->template Run<ReduceTensorOp::MIN>();
}
TYPED_TEST(ReduceWithNoIndexBHalfFloat, ReduceWithNoIndexTestBHalfFloat_MAX)
{
// trigger Run() -> Generic
this->template Run<ReduceTensorOp::MAX>();
} }
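With the hand-rolled main() gone, test selection moves to GoogleTest's standard mechanisms. A minimal sketch, assuming the test target supplies its own main() (the filter string is illustrative; any suite or test name from the TYPED_TEST expansion above works, and the same effect is available via --gtest_filter on the command line):

#include <gtest/gtest.h>

// Restrict this binary to the float no-index suites. Setting the flag after
// InitGoogleTest() overrides whatever was passed on the command line.
int main(int argc, char* argv[])
{
    ::testing::InitGoogleTest(&argc, argv);
    ::testing::GTEST_FLAG(filter) = "ReduceWithNoIndexFloat*";
    return RUN_ALL_TESTS();
}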
// ---- removed: standalone test_reduce_with_index driver ----

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#include <getopt.h>

#include "ck/library/utility/host_common_util.hpp"
#include "profiler/profile_reduce_impl.hpp"

using namespace ck;

static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'},
                                       {"reduceDimensions", required_argument, nullptr, 'R'},
                                       {"scales", required_argument, nullptr, 'S'},
                                       {"help", no_argument, nullptr, '?'},
                                       {nullptr, 0, nullptr, 0}};

class SimpleAppArgs
{
    private:
    int option_index = 0;

    public:
    std::vector<size_t> inLengths;
    std::vector<int> reduceDims;
    std::vector<float> scales;

    int data_type;
    int init_method = 1;

    // show_usage() and processArgs() are identical to the no-index driver
    // above, except that the usage text lists
    // "Arg1 -- data type (1: fp32, 3: int8, 5: bp16, 6: fp64)".
};

bool test_reduce_with_index(int data_type,
                            int init_method,
                            std::vector<int> reduceDims,
                            std::vector<size_t> inLengths,
                            ReduceTensorOp reduceOpId,
                            bool propagateNan,
                            float alpha,
                            float beta)
{
    using ck::profiler::profile_reduce_impl;

    bool result = true;

    // Same branch structure as test_reduce_no_index, but with useIndex = true
    // and per-type accumulator types:
    if(data_type == 0)
    {
        result = profile_reduce_impl<float, float, float>(
            true, init_method, false, false, inLengths, reduceDims, reduceOpId, propagateNan,
            true, alpha, beta);
    }
    else if(data_type == 1)
    {
        result = profile_reduce_impl<ck::half_t, ck::half_t, ck::half_t>(
            true, init_method, false, false, inLengths, reduceDims, reduceOpId, propagateNan,
            true, alpha, beta);
    }
    else if(data_type == 3)
    {
        result = profile_reduce_impl<int8_t, int8_t, int8_t>(
            true, init_method, false, false, inLengths, reduceDims, reduceOpId, propagateNan,
            true, alpha, beta);
    }
    else if(data_type == 5)
    {
        result = profile_reduce_impl<ck::bhalf_t, float, ck::bhalf_t>(
            true, init_method, false, false, inLengths, reduceDims, reduceOpId, propagateNan,
            true, alpha, beta);
    }
    else if(data_type == 6)
    {
        result = profile_reduce_impl<double, double, double>(
            true, init_method, false, false, inLengths, reduceDims, reduceOpId, propagateNan,
            true, alpha, beta);
    }

    return (result);
};

constexpr ReduceTensorOp reduceOpId = ReduceTensorOp::AMAX;
constexpr bool propagateNan         = false;

// main() is identical to the no-index driver's main() above, calling
// test_reduce_with_index instead and printing
// "test_reduce_with_index ..... SUCCESS/FAILURE".

// ---- added: gtest-based test_reduce_with_index ----

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#include <getopt.h>

#include "ck/library/utility/host_common_util.hpp"
#include "profiler/profile_reduce_impl.hpp"
#include <gtest/gtest.h>

using namespace ck;

struct ReduceParam
{
    bool do_verification{true};
    bool propagateNan{false};
    bool useIndex{false};
    bool time_kernel{false};
    bool do_dumpout{false};
    int init_method{2};
    float alpha{1.0f};
    float beta{0.0f};
    std::vector<size_t> inLengths{64, 4, 280, 82};
    std::vector<int> reduceDims{0, 1, 2, 3};
};

std::vector<std::vector<int>> SetGenericReduceDim()
{
    return {{0, 1, 2, 3}, {0, 1, 2}, {0, 1, 3}, {0, 2, 3}, {1, 2, 3}, {0}, {1}, {2}, {3}};
}

template <typename T>
class ReduceWithIndexTest : public ::testing::Test
{
    protected:
    using InDataType  = std::tuple_element_t<0, T>;
    using AccDataType = std::tuple_element_t<1, T>;
    using OutDataType = std::tuple_element_t<2, T>;

    static std::vector<ReduceParam> params;

    static void SetUpTestSuite()
    {
        // set testcase variables
        ReduceParam set;
        const auto setReduceDim = SetGenericReduceDim();

        for(std::size_t i(0); i < setReduceDim.size(); ++i)
        {
            set.reduceDims = setReduceDim[i];
            params.emplace_back(set);
        }
    }

    template <ReduceTensorOp ReduceOpIdType>
    void Run()
    {
        for(auto param : this->params)
        {
            bool success = ck::profiler::profile_reduce_impl<InDataType, AccDataType, OutDataType>(
                param.do_verification,
                param.init_method,
                param.do_dumpout,
                param.time_kernel,
                param.inLengths,
                param.reduceDims,
                ReduceOpIdType,
                param.propagateNan,
                param.useIndex,
                param.alpha,
                param.beta);
            EXPECT_TRUE(success);
        }
    }
};

template <typename T>
std::vector<ReduceParam> ReduceWithIndexTest<T>::params = {};

using Reduce_float_types  = ::testing::Types<std::tuple<float, float, float>>;
using Reduce_double_types = ::testing::Types<std::tuple<double, double, double>>;
using Reduce_int8t_types  = ::testing::Types<std::tuple<int8_t, int8_t, int8_t>>;
using Reduce_half_types   = ::testing::Types<std::tuple<ck::half_t, ck::half_t, ck::half_t>>;
using Reduce_bhalf_float_Types = ::testing::Types<std::tuple<ck::bhalf_t, float, ck::bhalf_t>>;

template <typename TType>
class ReduceWithIndexFloat : public ReduceWithIndexTest<TType> {};

template <typename TType>
class ReduceWithIndexDouble : public ReduceWithIndexTest<TType> {};

template <typename TType>
class ReduceWithIndexInt8 : public ReduceWithIndexTest<TType> {};

template <typename TType>
class ReduceWithIndexHalf : public ReduceWithIndexTest<TType> {};

template <typename TType>
class ReduceWithIndexBHalfFloat : public ReduceWithIndexTest<TType> {};

TYPED_TEST_SUITE(ReduceWithIndexFloat, Reduce_float_types);
TYPED_TEST_SUITE(ReduceWithIndexDouble, Reduce_double_types);
TYPED_TEST_SUITE(ReduceWithIndexInt8, Reduce_int8t_types);
TYPED_TEST_SUITE(ReduceWithIndexHalf, Reduce_half_types);
TYPED_TEST_SUITE(ReduceWithIndexBHalfFloat, Reduce_bhalf_float_Types);

TYPED_TEST(ReduceWithIndexFloat, ReduceWithIndexTestFloat_AMAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::AMAX>();
}

TYPED_TEST(ReduceWithIndexFloat, ReduceWithIndexTestFloat_MIN)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MIN>();
}

TYPED_TEST(ReduceWithIndexFloat, ReduceWithIndexTestFloat_MAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MAX>();
}

TYPED_TEST(ReduceWithIndexDouble, ReduceWithIndexTestDouble_AMAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::AMAX>();
}

TYPED_TEST(ReduceWithIndexDouble, ReduceWithIndexTestDouble_MIN)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MIN>();
}

TYPED_TEST(ReduceWithIndexDouble, ReduceWithIndexTestDouble_MAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MAX>();
}

TYPED_TEST(ReduceWithIndexInt8, ReduceWithIndexTestInt8_AMAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::AMAX>();
}

TYPED_TEST(ReduceWithIndexInt8, ReduceWithIndexTestInt8_MIN)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MIN>();
}

TYPED_TEST(ReduceWithIndexInt8, ReduceWithIndexTestInt8_MAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MAX>();
}

TYPED_TEST(ReduceWithIndexHalf, ReduceWithIndexTestHalf_AMAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::AMAX>();
}

TYPED_TEST(ReduceWithIndexHalf, ReduceWithIndexTestHalf_MIN)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MIN>();
}

TYPED_TEST(ReduceWithIndexHalf, ReduceWithIndexTestHalf_MAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MAX>();
}

TYPED_TEST(ReduceWithIndexBHalfFloat, ReduceWithIndexTestBHalfFloat_AMAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::AMAX>();
}

TYPED_TEST(ReduceWithIndexBHalfFloat, ReduceWithIndexTestBHalfFloat_MIN)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MIN>();
}

TYPED_TEST(ReduceWithIndexBHalfFloat, ReduceWithIndexTestBHalfFloat_MAX)
{
    // trigger Run() -> Generic
    this->template Run<ReduceTensorOp::MAX>();
}
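For comparison with the gtest flow above, this is roughly what one direct call into the profiler entry point looks like, mirroring the data_type == 0 branch of the removed driver; the standalone main() wrapper is a sketch, and the argument order follows the calls shown in this diff:

#include <cstddef>
#include <vector>

#include "profiler/profile_reduce_impl.hpp"

// One fp32 AMAX reduction over all four dimensions, with index reporting
// enabled (useIndex = true), as the old test_reduce_with_index driver did.
int main()
{
    std::vector<std::size_t> inLengths{64, 4, 280, 80};
    std::vector<int> reduceDims{0, 1, 2, 3};

    const bool ok = ck::profiler::profile_reduce_impl<float, float, float>(
        /*do_verification=*/true,
        /*init_method=*/2,
        /*do_dumpout=*/false,
        /*time_kernel=*/false,
        inLengths,
        reduceDims,
        ck::ReduceTensorOp::AMAX,
        /*propagateNan=*/false,
        /*useIndex=*/true,
        /*alpha=*/1.0f,
        /*beta=*/0.0f);

    return ok ? 0 : -1;
}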
@@ -13,6 +13,7 @@
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "test/smfmac_op/smfmac_op_util.hpp"
+#include "ck/host_utility/device_prop.hpp"

 using BF16 = ck::bhalf_t;
 using F16  = ck::half_t;

@@ -38,40 +39,43 @@ class TestSmfmac : public ::testing::Test
     void Run()
     {
         bool pass = true;
-        constexpr auto matmul_default = ck::smfmac_op_util::matmul<Src1Type,
-                                                                   Src1VecSize,
-                                                                   Src2Type,
-                                                                   Src2VecSize,
-                                                                   GPUAccType,
-                                                                   AccVecSize,
-                                                                   DstType,
-                                                                   M,
-                                                                   N,
-                                                                   K>;
-
-        constexpr auto smfmac_kernel_container = std::make_tuple(matmul_default);
-
-        ck::static_for<0, std::tuple_size_v<decltype(smfmac_kernel_container)>, 1>{}([&](auto i) {
-            pass &= ck::smfmac_op_util::TestSmfmac<
-                std::tuple_element_t<i.value, decltype(smfmac_kernel_container)>,
-                Src1Type,
-                Src2Type,
-                DstType,
-                GPUAccType,
-                CPUAccType,
-                decltype(Row{}),
-                decltype(Row{}),
-                decltype(Row{}),
-                PassThrough,
-                PassThrough,
-                PassThrough,
-                AccVecSize,
-                M,
-                N,
-                K>{}(std::get<ck::Number<i>{}>(smfmac_kernel_container));
-        });
+        if(ck::get_device_name() == "gfx942")
+        {
+            constexpr auto matmul_default = ck::smfmac_op_util::matmul<Src1Type,
+                                                                       Src1VecSize,
+                                                                       Src2Type,
+                                                                       Src2VecSize,
+                                                                       GPUAccType,
+                                                                       AccVecSize,
+                                                                       DstType,
+                                                                       M,
+                                                                       N,
+                                                                       K>;
+
+            constexpr auto smfmac_kernel_container = std::make_tuple(matmul_default);
+
+            ck::static_for<0, std::tuple_size_v<decltype(smfmac_kernel_container)>, 1>{}(
+                [&](auto i) {
+                    pass &= ck::smfmac_op_util::TestSmfmac<
+                        std::tuple_element_t<i.value, decltype(smfmac_kernel_container)>,
+                        Src1Type,
+                        Src2Type,
+                        DstType,
+                        GPUAccType,
+                        CPUAccType,
+                        decltype(Row{}),
+                        decltype(Row{}),
+                        decltype(Row{}),
+                        PassThrough,
+                        PassThrough,
+                        PassThrough,
+                        AccVecSize,
+                        M,
+                        N,
+                        K>{}(std::get<ck::Number<i>{}>(smfmac_kernel_container));
+                });
+        }

         EXPECT_TRUE(pass);
     }
 };
...
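One consequence of the guard above is that on anything other than gfx942 the body is skipped and the test still reports success. If a visible skip is preferred, GoogleTest's GTEST_SKIP() can express that; a minimal sketch (the test name here is invented for illustration):

#include <gtest/gtest.h>

#include "ck/host_utility/device_prop.hpp"

// Report non-gfx942 devices as skipped instead of silently passing;
// ck::get_device_name() comes from the header added in this hunk.
TEST(TestSmfmacDeviceGuard, SkipsOffGfx942)
{
    if(ck::get_device_name() != "gfx942")
    {
        GTEST_SKIP() << "smfmac instructions require gfx942";
    }

    // On gfx942 the real Run() body above would execute the smfmac
    // kernel container here.
    SUCCEED();
}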