Unverified commit 9f8ab221 authored by zjing14, committed by GitHub

Merge branch 'develop' into add_int8_wmma_example_instance

parents 755ace59 b4fc4d0b
@@ -20,9 +20,11 @@ enum struct ConvLayout
enum struct ConvDataType
{
F32_F32_F32, // 0
F16_F16_F16, // 1
BF16_F32_BF16, // 2
F32_F32_F32, // 0
F16_F16_F16, // 1
BF16_F32_BF16, // 2
F16_F16_F16_BF8_F8, // 3
I8_I8_I8 // 4
};
#define OP_NAME "grouped_conv_bwd_weight"
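For context, the profiler maps the numeric arg2 selector from the help message below straight onto this enum. A self-contained sketch of that parsing step (hypothetical, mirrors the profiler's handling of arg2; not part of this diff):

    // Sketch: the command-line selector is cast directly onto the enum.
    #include <string>

    enum struct ConvDataType
    {
        F32_F32_F32,        // 0
        F16_F16_F16,        // 1
        BF16_F32_BF16,      // 2
        F16_F16_F16_BF8_F8, // 3
        I8_I8_I8            // 4
    };

    int main(int argc, char* argv[])
    {
        if(argc < 2)
            return 1;
        const auto data_type = static_cast<ConvDataType>(std::stoi(argv[1]));
        // e.g. "./a.out 4" selects the int8 path added in this commit
        return data_type == ConvDataType::I8_I8_I8 ? 0 : 1;
    }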
@@ -33,7 +35,9 @@ static void print_helper_msg()
std::cout << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
<< "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n"
<< " 1: Input fp16, Weight fp16, Output fp16\n"
<< " 2: Input bf16, Weight fp32, Output bf16)\n"
<< " 2: Input bf16, Weight fp32, Output bf16\n"
<< " 3: Input fp16, Weight fp16, Output fp16, Gemm bf8@fp8\n"
<< " 4: Input int8, Weight int8, Output int8)\n"
<< "arg3: tensor layout (0: Input[G, N, C, Hi, Wi], Weight[G, K, C, Y, X], Output[G, "
"N, K, Ho, Wo]\n"
<< " 1: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, "
@@ -82,6 +86,8 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
using F32 = float;
using F16 = ck::half_t;
using BF16 = ck::bhalf_t;
using F8 = ck::f8_t;
using BF8 = ck::bf8_t;
using namespace ck::tensor_layout::convolution;
@@ -95,7 +101,9 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
auto out_layout,
auto in_type,
auto wei_type,
auto out_type) {
auto out_type,
auto compute_type_a,
auto compute_type_b) {
constexpr ck::index_t NDimSpatial = num_dim_spatial_tmp.value;
using InLayout = decltype(in_layout);
@@ -106,13 +114,18 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
using WeiDataType = decltype(wei_type);
using OutDataType = decltype(out_type);
using ComputeTypeA = decltype(compute_type_a);
using ComputeTypeB = decltype(compute_type_b);
bool pass = ck::profiler::profile_grouped_conv_bwd_weight_impl<NDimSpatial,
InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType>(
OutDataType,
ComputeTypeA,
ComputeTypeB>(
do_verification, init_method, do_log, time_kernel, params, split_k);
return pass ? 0 : 1;
@@ -122,80 +135,94 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
{
if(data_type == ConvDataType::F32_F32_F32)
{
return profile(I1, GNWC{}, GKXC{}, GNWK{}, F32{}, F32{}, F32{});
return profile(I1, GNWC{}, GKXC{}, GNWK{}, F32{}, F32{}, F32{}, F32{}, F32{});
}
else if(data_type == ConvDataType::F16_F16_F16)
if(data_type == ConvDataType::F16_F16_F16)
{
return profile(I1, GNWC{}, GKXC{}, GNWK{}, F16{}, F16{}, F16{});
return profile(I1, GNWC{}, GKXC{}, GNWK{}, F16{}, F16{}, F16{}, F16{}, F16{});
}
else if(data_type == ConvDataType::BF16_F32_BF16)
if(data_type == ConvDataType::BF16_F32_BF16)
{
// fp32 atomic add is used for weight tensor in bf16 kernel
return profile(I1, GNWC{}, GKXC{}, GNWK{}, BF16{}, F32{}, BF16{});
return profile(I1, GNWC{}, GKXC{}, GNWK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{});
}
}
else if(num_dim_spatial == 2 && layout == ConvLayout::GNHWC_GKYXC_GNHWK)
if(num_dim_spatial == 2 && layout == ConvLayout::GNHWC_GKYXC_GNHWK)
{
if(data_type == ConvDataType::F32_F32_F32)
{
return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, F32{}, F32{}, F32{});
return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, F32{}, F32{}, F32{}, F32{}, F32{});
}
else if(data_type == ConvDataType::F16_F16_F16)
if(data_type == ConvDataType::F16_F16_F16)
{
return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, F16{}, F16{}, F16{});
return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, F16{}, F16{}, F16{}, F16{}, F16{});
}
else if(data_type == ConvDataType::BF16_F32_BF16)
if(data_type == ConvDataType::BF16_F32_BF16)
{
// fp32 atomic add is used for weight tensor in bf16 kernel
return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, BF16{}, F32{}, BF16{});
return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{});
}
}
else if(num_dim_spatial == 2 && layout == ConvLayout::NHWGC_GKYXC_NHWGK)
if(num_dim_spatial == 2 && layout == ConvLayout::NHWGC_GKYXC_NHWGK)
{
if(data_type == ConvDataType::F32_F32_F32)
{
return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, F32{}, F32{}, F32{});
return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, F32{}, F32{}, F32{}, F32{}, F32{});
}
else if(data_type == ConvDataType::F16_F16_F16)
if(data_type == ConvDataType::F16_F16_F16)
{
return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, F16{}, F16{}, F16{});
return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, F16{}, F16{}, F16{}, F16{}, F16{});
}
else if(data_type == ConvDataType::BF16_F32_BF16)
if(data_type == ConvDataType::BF16_F32_BF16)
{
// fp32 atomic add is used for weight tensor in bf16 kernel
return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, BF16{}, F32{}, BF16{});
return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{});
}
}
else if(num_dim_spatial == 3 && layout == ConvLayout::GNHWC_GKYXC_GNHWK)
if(num_dim_spatial == 3 && layout == ConvLayout::GNHWC_GKYXC_GNHWK)
{
if(data_type == ConvDataType::F32_F32_F32)
{
return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, F32{}, F32{}, F32{});
return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, F32{}, F32{}, F32{}, F32{}, F32{});
}
else if(data_type == ConvDataType::F16_F16_F16)
if(data_type == ConvDataType::F16_F16_F16)
{
return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, F16{}, F16{}, F16{});
return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, F16{}, F16{}, F16{}, F16{}, F16{});
}
else if(data_type == ConvDataType::BF16_F32_BF16)
if(data_type == ConvDataType::BF16_F32_BF16)
{
// fp32 atomic add is used for weight tensor in bf16 kernel
return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, BF16{}, F32{}, BF16{});
return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{});
}
else if(data_type == ConvDataType::I8_I8_I8)
{
return profile(
I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, int8_t{}, int8_t{}, int8_t{}, int8_t{}, int8_t{});
}
}
else if(num_dim_spatial == 3 && layout == ConvLayout::NHWGC_GKYXC_NHWGK)
if(num_dim_spatial == 3 && layout == ConvLayout::NHWGC_GKYXC_NHWGK)
{
if(data_type == ConvDataType::F32_F32_F32)
{
return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F32{}, F32{}, F32{});
return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F32{}, F32{}, F32{}, F32{}, F32{});
}
else if(data_type == ConvDataType::F16_F16_F16)
if(data_type == ConvDataType::F16_F16_F16)
{
return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F16{}, F16{}, F16{});
return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F16{}, F16{}, F16{}, F16{}, F16{});
}
else if(data_type == ConvDataType::BF16_F32_BF16)
if(data_type == ConvDataType::BF16_F32_BF16)
{
// fp32 atomic add is used for weight tensor in bf16 kernel
return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF16{}, F32{}, BF16{});
return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{});
}
if(data_type == ConvDataType::F16_F16_F16_BF8_F8)
{
return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F16{}, F16{}, F16{}, BF8{}, F8{});
}
else if(data_type == ConvDataType::I8_I8_I8)
{
return profile(
I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, int8_t{}, int8_t{}, int8_t{}, int8_t{}, int8_t{});
}
}
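The dispatch above relies on a tag-dispatch idiom: types are handed to the generic profile lambda as value-initialized tags (F16{}, BF8{}, ...) and recovered with decltype. A self-contained sketch of the idiom, using built-in stand-ins rather than the actual CK types:

    #include <iostream>
    #include <type_traits>

    int main()
    {
        // Generic lambda receiving type tags by value, as profile(...) does;
        // decltype recovers the compile-time types from the value arguments.
        auto profile = [](auto in_type, auto compute_type_a, auto compute_type_b) {
            using InDataType   = decltype(in_type);
            using ComputeTypeA = decltype(compute_type_a);
            using ComputeTypeB = decltype(compute_type_b);
            // Most paths compute in the input type; the bf8@fp8 path does not.
            return std::is_same_v<InDataType, ComputeTypeA> &&
                   std::is_same_v<ComputeTypeA, ComputeTypeB>;
        };

        std::cout << profile(float{}, float{}, float{}) << '\n'; // 1, like F32_F32_F32
        std::cout << profile(float{}, char{}, short{}) << '\n';  // 0, like F16_F16_F16_BF8_F8
        return 0;
    }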
......
@@ -93,12 +93,12 @@ int profile_groupnorm(int argc, char* argv[])
if(data_type == ck::DataTypeEnum::Float)
{
ck::profiler::profile_groupnorm_impl<F32, F32, F32, F32, F32>(
ck::profiler::profile_groupnorm_impl<F32, F32, F32, F32, F32, F32, false>(
do_verification, init_method, do_log, time_kernel, length);
}
else if(data_type == ck::DataTypeEnum::Half)
{
ck::profiler::profile_groupnorm_impl<F16, F16, F16, F32, F16>(
ck::profiler::profile_groupnorm_impl<F16, F16, F16, F32, F16, F32, false>(
do_verification, init_method, do_log, time_kernel, length);
}
else
......
@@ -82,12 +82,12 @@ int profile_layernorm(int argc, char* argv[])
if(data_type == ck::DataTypeEnum::Half)
{
ck::profiler::profile_layernorm_impl<F16, F16, F16, F32, F16, rank>(
ck::profiler::profile_layernorm_impl<F16, F16, F16, F32, F16, F32, false, rank>(
do_verification, init_method, do_log, time_kernel, length);
}
else if(data_type == ck::DataTypeEnum::Float)
{
ck::profiler::profile_layernorm_impl<F32, F32, F32, F32, F32, rank>(
ck::profiler::profile_layernorm_impl<F32, F32, F32, F32, F32, F32, false, rank>(
do_verification, init_method, do_log, time_kernel, length);
}
else
......
@@ -9,26 +9,121 @@ add_custom_target(tests)
function(add_test_executable TEST_NAME)
message("adding test ${TEST_NAME}")
add_executable(${TEST_NAME} ${ARGN})
add_test(NAME ${TEST_NAME} COMMAND $<TARGET_FILE:${TEST_NAME}>)
add_dependencies(tests ${TEST_NAME})
add_dependencies(check ${TEST_NAME})
rocm_install(TARGETS ${TEST_NAME} COMPONENT tests)
set(result 1)
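# result is reported back to the caller via PARENT_SCOPE at the end of this
# function: 1 means every source file was filtered out (no test target was
# created), 0 means the target exists and may be linked/configured.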
if(DEFINED DTYPES)
foreach(source IN LISTS ARGN)
set(test 0)
foreach(type IN LISTS DTYPES)
if(type MATCHES "fp16")
set(type1 "_f16")
elseif(type MATCHES "fp32")
set(type1 "_f32")
elseif(type MATCHES "fp8")
set(type1 "_f8")
elseif(type MATCHES "bf16")
set(type1 "_b16")
elseif(type MATCHES "fp64")
set(type1 "_f64")
elseif(type MATCHES "int8")
set(type1 "_i8")
endif()
if("${source}" MATCHES "${type}" OR "${source}" MATCHES "${type1}")
#if filename matches any selected type, exit type loop and do not exclude the file from the list
set(test 0)
break()
elseif((source MATCHES "fp8" OR source MATCHES "fp32" OR source MATCHES "fp64" OR source MATCHES "bf16" OR source MATCHES "int8" OR source MATCHES "fp16" OR
source MATCHES "_f8" OR source MATCHES "_f32" OR source MATCHES "_f64" OR source MATCHES "_i8" OR source MATCHES "_f16" OR source MATCHES "_b16") AND
NOT(source MATCHES type OR source MATCHES type1))
#if filename contains a type which doesn't match any selected type, mark it for removal
set(test 1)
endif()
endforeach()
if(test EQUAL 1)
message("removing test ${source} ")
list(REMOVE_ITEM ARGN "${source}")
endif()
endforeach()
endif()
foreach(source IN LISTS ARGN)
if(NOT DEFINED DL_KERNELS AND source MATCHES "_dl")
message("removing dl test ${source} ")
list(REMOVE_ITEM ARGN "${source}")
endif()
endforeach()
#only continue if there are some source files left on the list
if(ARGN)
add_executable(${TEST_NAME} ${ARGN})
add_test(NAME ${TEST_NAME} COMMAND $<TARGET_FILE:${TEST_NAME}>)
add_dependencies(tests ${TEST_NAME})
add_dependencies(check ${TEST_NAME})
rocm_install(TARGETS ${TEST_NAME} COMPONENT tests)
set(result 0)
endif()
#message("add_test returns ${result}")
set(result ${result} PARENT_SCOPE)
endfunction(add_test_executable TEST_NAME)
include(GoogleTest)
function(add_gtest_executable TEST_NAME)
message("adding gtest ${TEST_NAME}")
add_executable(${TEST_NAME} ${ARGN})
add_dependencies(tests ${TEST_NAME})
add_dependencies(check ${TEST_NAME})
set(result 1)
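# same result protocol as add_test_executable: 0 = target created, 1 = skipped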
if(DEFINED DTYPES)
foreach(source IN LISTS ARGN)
set(test 0)
foreach(type IN LISTS DTYPES)
if(type MATCHES "fp16")
set(type1 "_f16")
elseif(type MATCHES "fp32")
set(type1 "_f32")
elseif(type MATCHES "fp8")
set(type1 "_f8")
elseif(type MATCHES "bf16")
set(type1 "_b16")
elseif(type MATCHES "fp64")
set(type1 "_f64")
elseif(type MATCHES "int8")
set(type1 "_i8")
endif()
if("${source}" MATCHES "${type}" OR "${source}" MATCHES "${type1}")
#if filename matches any selected type, exit type loop and do not exclude the file from the list
set(test 0)
break()
elseif((source MATCHES "fp8" OR source MATCHES "fp32" OR source MATCHES "fp64" OR source MATCHES "bf16" OR source MATCHES "int8" OR source MATCHES "fp16" OR
source MATCHES "_f8" OR source MATCHES "_f32" OR source MATCHES "_f64" OR source MATCHES "_i8" OR source MATCHES "_f16" OR source MATCHES "_b16") AND
NOT(source MATCHES type OR source MATCHES type1))
#if filename contains a type which doesn't match any selected type, mark it for removal
set(test 1)
endif()
endforeach()
if(test EQUAL 1)
message("removing gtest ${source} ")
list(REMOVE_ITEM ARGN "${source}")
endif()
endforeach()
endif()
foreach(source IN LISTS ARGN)
if(NOT DEFINED DL_KERNELS AND source MATCHES "_dl")
message("removing dl test ${source} ")
list(REMOVE_ITEM ARGN "${source}")
endif()
endforeach()
#only continue if there are some source files left on the list
if(ARGN)
add_executable(${TEST_NAME} ${ARGN})
add_dependencies(tests ${TEST_NAME})
add_dependencies(check ${TEST_NAME})
# suppress gtest warnings
target_compile_options(${TEST_NAME} PRIVATE -Wno-global-constructors -Wno-undef)
target_link_libraries(${TEST_NAME} PRIVATE gtest_main)
add_test(NAME ${TEST_NAME} COMMAND $<TARGET_FILE:${TEST_NAME}>)
rocm_install(TARGETS ${TEST_NAME} COMPONENT tests)
# suppress gtest warnings
target_compile_options(${TEST_NAME} PRIVATE -Wno-global-constructors -Wno-undef)
target_link_libraries(${TEST_NAME} PRIVATE gtest_main)
add_test(NAME ${TEST_NAME} COMMAND $<TARGET_FILE:${TEST_NAME}>)
rocm_install(TARGETS ${TEST_NAME} COMPONENT tests)
set(result 0)
endif()
#message("add_gtest returns ${result}")
set(result ${result} PARENT_SCOPE)
endfunction(add_gtest_executable TEST_NAME)
add_subdirectory(magic_number_division)
@@ -60,7 +155,7 @@ add_subdirectory(contraction)
add_subdirectory(pool)
add_subdirectory(batched_gemm_multi_d)
add_subdirectory(grouped_convnd_bwd_data)
add_subdirectory(image_to_column)
add_subdirectory(conv_tensor_rearrange)
if(GPU_TARGETS MATCHES "gfx11")
add_subdirectory(wmma_op)
endif()
@@ -2,26 +2,8 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
set(target 0)
foreach(gpu IN LISTS GPU_TARGETS)
if(gpu IN_LIST gpu_list AND target EQUAL 0)
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
add_test_executable(test_batched_gemm_fp16 batched_gemm_fp16.cpp)
target_link_libraries(test_batched_gemm_fp16 PRIVATE utility)
target_link_libraries(test_batched_gemm_fp16 PRIVATE device_batched_gemm_instance)
endif()
if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
add_test_executable(test_batched_gemm_fp32 batched_gemm_fp32.cpp)
target_link_libraries(test_batched_gemm_fp32 PRIVATE utility)
target_link_libraries(test_batched_gemm_fp32 PRIVATE device_batched_gemm_instance)
endif()
if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
add_test_executable(test_batched_gemm_bf16 batched_gemm_bf16.cpp)
target_link_libraries(test_batched_gemm_bf16 PRIVATE utility)
target_link_libraries(test_batched_gemm_bf16 PRIVATE device_batched_gemm_instance)
endif()
if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
add_test_executable(test_batched_gemm_int8 batched_gemm_int8.cpp)
target_link_libraries(test_batched_gemm_int8 PRIVATE utility)
target_link_libraries(test_batched_gemm_int8 PRIVATE device_batched_gemm_instance)
endif()
add_gtest_executable(test_batched_gemm test_batched_gemm.cpp)
target_link_libraries(test_batched_gemm PRIVATE utility device_batched_gemm_instance)
set(target 1)
endif()
endforeach()
\ No newline at end of file
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "profiler/profile_batched_gemm_impl.hpp"
#include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
namespace {
using ADataType = ck::bhalf_t;
using BDataType = ck::bhalf_t;
using CDataType = ck::bhalf_t;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
} // namespace
int main()
{
int M = 256;
int N = 256;
int K = 128;
int BatchCount = 3;
bool pass = true;
using namespace ck::tensor_operation::device;
pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
BDataType,
CDataType,
Row,
Row,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Row,
Row,
Row,
ADataType,
BDataType,
CDataType,
PassThrough,
PassThrough,
PassThrough>>(
true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
BDataType,
CDataType,
Row,
Col,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Row,
Col,
Row,
ADataType,
BDataType,
CDataType,
PassThrough,
PassThrough,
PassThrough>>(
true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
BDataType,
CDataType,
Col,
Row,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Col,
Row,
Row,
ADataType,
BDataType,
CDataType,
PassThrough,
PassThrough,
PassThrough>>(
true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
BDataType,
CDataType,
Col,
Col,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Col,
Col,
Row,
ADataType,
BDataType,
CDataType,
PassThrough,
PassThrough,
PassThrough>>(
true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
std::cout << "test BatchedGEMM bf16: " << (pass ? "Pass" : "Fail") << std::endl;
return pass ? 0 : 1;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "profiler/profile_batched_gemm_impl.hpp"
#include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
namespace {
using ADataType = ck::half_t;
using BDataType = ck::half_t;
using CDataType = ck::half_t;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
} // namespace
int main()
{
int M = 512;
int N = 256;
int K = 128;
int BatchCount = 3;
bool pass = true;
using namespace ck::tensor_operation::device;
pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
BDataType,
CDataType,
Row,
Row,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Row,
Row,
Row,
ADataType,
BDataType,
CDataType,
PassThrough,
PassThrough,
PassThrough>>(
true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
BDataType,
CDataType,
Row,
Col,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Row,
Col,
Row,
ADataType,
BDataType,
CDataType,
PassThrough,
PassThrough,
PassThrough>>(
true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
BDataType,
CDataType,
Col,
Row,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Col,
Row,
Row,
ADataType,
BDataType,
CDataType,
PassThrough,
PassThrough,
PassThrough>>(
true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
BDataType,
CDataType,
Col,
Col,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Col,
Col,
Row,
ADataType,
BDataType,
CDataType,
PassThrough,
PassThrough,
PassThrough>>(
true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
std::cout << "test BatchedGEMM fp16: " << (pass ? "Pass" : "Fail") << std::endl;
return pass ? 0 : 1;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "profiler/profile_batched_gemm_impl.hpp"
#include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
namespace {
using ADataType = float;
using BDataType = float;
using CDataType = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
} // namespace
int main()
{
int M = 256;
int N = 256;
int K = 128;
int BatchCount = 3;
bool pass = true;
using namespace ck::tensor_operation::device;
pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
BDataType,
CDataType,
Row,
Row,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Row,
Row,
Row,
ADataType,
BDataType,
CDataType,
PassThrough,
PassThrough,
PassThrough>>(
true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
BDataType,
CDataType,
Row,
Col,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Row,
Col,
Row,
ADataType,
BDataType,
CDataType,
PassThrough,
PassThrough,
PassThrough>>(
true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
BDataType,
CDataType,
Col,
Row,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Col,
Row,
Row,
ADataType,
BDataType,
CDataType,
PassThrough,
PassThrough,
PassThrough>>(
true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
BDataType,
CDataType,
Col,
Col,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Col,
Col,
Row,
ADataType,
BDataType,
CDataType,
PassThrough,
PassThrough,
PassThrough>>(
true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
std::cout << "test BatchedGEMM fp32: " << (pass ? "Pass" : "Fail") << std::endl;
return pass ? 0 : 1;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "profiler/profile_batched_gemm_impl.hpp"
#include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
namespace {
using ADataType = int8_t;
using BDataType = int8_t;
using CDataType = int8_t;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
} // namespace
int main()
{
int M = 256;
int N = 256;
int K = 128;
int BatchCount = 3;
bool pass = true;
using namespace ck::tensor_operation::device;
pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
BDataType,
CDataType,
Row,
Row,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Row,
Row,
Row,
ADataType,
BDataType,
CDataType,
PassThrough,
PassThrough,
PassThrough>>(
true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
BDataType,
CDataType,
Row,
Col,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Row,
Col,
Row,
ADataType,
BDataType,
CDataType,
PassThrough,
PassThrough,
PassThrough>>(
true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
BDataType,
CDataType,
Col,
Row,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Col,
Row,
Row,
ADataType,
BDataType,
CDataType,
PassThrough,
PassThrough,
PassThrough>>(
true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
pass = pass && ck::profiler::profile_batched_gemm_impl<ADataType,
BDataType,
CDataType,
Col,
Col,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Col,
Col,
Row,
ADataType,
BDataType,
CDataType,
PassThrough,
PassThrough,
PassThrough>>(
true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
std::cout << "test BatchedGEMM int8: " << (pass ? "Pass" : "Fail") << std::endl;
return pass ? 0 : 1;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <tuple>
#include <vector>
#include <gtest/gtest.h>
#include "profiler/profile_batched_gemm_impl.hpp"
#include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
struct GemmParams
{
ck::index_t M;
ck::index_t N;
ck::index_t K;
ck::index_t BatchCount;
};
class TestBatchedGemm : public ::testing::Test
{
protected:
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
std::vector<GemmParams> params;
template <typename DataType>
void Run()
{
using namespace ck::tensor_operation::device;
bool pass = true;
for(auto& param : params)
{
const auto M = param.M;
const auto N = param.N;
const auto K = param.K;
const auto BatchCount = param.BatchCount;
pass =
pass && ck::profiler::profile_batched_gemm_impl<DataType,
DataType,
DataType,
Row,
Row,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Row,
Row,
Row,
DataType,
DataType,
DataType,
PassThrough,
PassThrough,
PassThrough>>(
true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount);
pass =
pass && ck::profiler::profile_batched_gemm_impl<DataType,
DataType,
DataType,
Row,
Col,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Row,
Col,
Row,
DataType,
DataType,
DataType,
PassThrough,
PassThrough,
PassThrough>>(
true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount);
pass =
pass && ck::profiler::profile_batched_gemm_impl<DataType,
DataType,
DataType,
Col,
Row,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Col,
Row,
Row,
DataType,
DataType,
DataType,
PassThrough,
PassThrough,
PassThrough>>(
true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount);
pass =
pass && ck::profiler::profile_batched_gemm_impl<DataType,
DataType,
DataType,
Col,
Col,
Row,
PassThrough,
PassThrough,
PassThrough,
DeviceBatchedGemm<Col,
Col,
Row,
DataType,
DataType,
DataType,
PassThrough,
PassThrough,
PassThrough>>(
true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount);
}
EXPECT_TRUE(pass);
}
};
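All four profile_batched_gemm_impl calls in Run() differ only in the A/B layouts and the stride arguments, and the strides follow directly from the layouts. A small sketch of that rule (illustrative helper, not CK API):

    // A is MxK, B is KxN, C is MxN and row-major in every call above.
    // Row-major stride = column count of the matrix; column-major = row count.
    struct Strides
    {
        int a, b, c;
    };

    Strides make_strides(bool a_row_major, bool b_row_major, int M, int N, int K)
    {
        return {a_row_major ? K : M,  // lda
                b_row_major ? N : K,  // ldb
                N};                   // ldc
    }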
#ifdef CK_ENABLE_INT8
TEST_F(TestBatchedGemm, i8)
{
this->params.push_back({64, 64, 64, 2});
this->params.push_back({64, 64, 64, 1});
this->params.push_back({60, 60, 60, 2});
this->params.push_back({68, 68, 68, 2});
this->params.push_back({40, 40, 40, 2});
this->params.push_back({256, 256, 128, 3});
this->template Run<int8_t>();
}
#endif
#ifdef CK_ENABLE_BF16
TEST_F(TestBatchedGemm, bf16)
{
this->params.push_back({64, 64, 64, 2});
this->params.push_back({64, 64, 64, 1});
this->params.push_back({60, 60, 60, 2});
this->params.push_back({68, 68, 68, 2});
this->params.push_back({40, 40, 40, 2});
this->params.push_back({256, 256, 128, 3});
this->template Run<ck::bhalf_t>();
}
#endif
#ifdef CK_ENABLE_FP16
TEST_F(TestBatchedGemm, fp16)
{
this->params.push_back({64, 64, 64, 2});
this->params.push_back({64, 64, 64, 1});
this->params.push_back({60, 60, 60, 2});
this->params.push_back({68, 68, 68, 2});
this->params.push_back({40, 40, 40, 2});
this->params.push_back({256, 256, 128, 3});
this->template Run<ck::half_t>();
}
#endif
#ifdef CK_ENABLE_FP32
TEST_F(TestBatchedGemm, fp32)
{
this->params.push_back({64, 64, 64, 2});
this->params.push_back({64, 64, 64, 1});
this->params.push_back({60, 60, 60, 2});
this->params.push_back({68, 68, 68, 2});
this->params.push_back({40, 40, 40, 2});
this->params.push_back({256, 256, 128, 3});
this->template Run<float>();
}
#endif
@@ -2,12 +2,12 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
set(target 0)
foreach(gpu IN LISTS GPU_TARGETS)
if(gpu IN_LIST gpu_list AND target EQUAL 0)
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
add_custom_target(test_batched_gemm_gemm)
add_gtest_executable(test_batched_gemm_gemm_fp16 test_batched_gemm_gemm_fp16.cpp)
add_custom_target(test_batched_gemm_gemm)
add_gtest_executable(test_batched_gemm_gemm_fp16 test_batched_gemm_gemm_fp16.cpp)
if(result EQUAL 0)
target_link_libraries(test_batched_gemm_gemm_fp16 PRIVATE utility device_batched_gemm_gemm_instance)
add_dependencies(test_batched_gemm_gemm test_batched_gemm_gemm_fp16)
set(target 1)
endif()
endif()
endif()
endforeach()
\ No newline at end of file
if(DL_KERNELS)
add_gtest_executable(test_batched_gemm_multi_d test_batched_gemm_multi_d.cpp)
add_gtest_executable(test_batched_gemm_multi_d test_batched_gemm_multi_d_dl.cpp)
if(result EQUAL 0)
target_link_libraries(test_batched_gemm_multi_d PRIVATE utility device_batched_gemm_multi_d_instance)
endif()
@@ -2,10 +2,9 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
set(target 0)
foreach(gpu IN LISTS GPU_TARGETS)
if(gpu IN_LIST gpu_list AND target EQUAL 0)
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
add_test_executable(test_batched_gemm_reduce_fp16 batched_gemm_reduce_fp16.cpp)
target_link_libraries(test_batched_gemm_reduce_fp16 PRIVATE utility)
target_link_libraries(test_batched_gemm_reduce_fp16 PRIVATE device_batched_gemm_reduce_instance)
add_test_executable(test_batched_gemm_reduce_fp16 batched_gemm_reduce_fp16.cpp)
if(result EQUAL 0)
target_link_libraries(test_batched_gemm_reduce_fp16 PRIVATE utility device_batched_gemm_reduce_instance)
set(target 1)
endif()
endif()
......
@@ -2,12 +2,12 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
set(target 0)
foreach(gpu IN LISTS GPU_TARGETS)
if(gpu IN_LIST gpu_list AND target EQUAL 0)
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
add_custom_target(test_batched_gemm_softmax_gemm)
add_gtest_executable(test_batched_gemm_softmax_gemm_fp16 test_batched_gemm_softmax_gemm_fp16.cpp)
target_link_libraries(test_batched_gemm_softmax_gemm_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_instance)
add_dependencies(test_batched_gemm_softmax_gemm test_batched_gemm_softmax_gemm_fp16)
set(target 1)
endif()
add_custom_target(test_batched_gemm_softmax_gemm)
add_gtest_executable(test_batched_gemm_softmax_gemm_fp16 test_batched_gemm_softmax_gemm_fp16.cpp)
if(result EQUAL 0)
target_link_libraries(test_batched_gemm_softmax_gemm_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_instance)
add_dependencies(test_batched_gemm_softmax_gemm test_batched_gemm_softmax_gemm_fp16)
set(target 1)
endif()
endif()
endforeach()
\ No newline at end of file
@@ -2,25 +2,28 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
set(target 0)
foreach(gpu IN LISTS GPU_TARGETS)
if(gpu IN_LIST gpu_list AND target EQUAL 0)
if(DTYPES MATCHES "fp16" OR DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
add_custom_target(test_batched_gemm_softmax_gemm_permute)
endif()
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
add_gtest_executable(test_batched_gemm_softmax_gemm_permute_fp16 test_batched_gemm_softmax_gemm_permute_fp16.cpp)
add_gtest_executable(test_batched_gemm_bias_softmax_gemm_permute_fp16 test_batched_gemm_bias_softmax_gemm_permute_fp16.cpp)
target_link_libraries(test_batched_gemm_softmax_gemm_permute_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
target_link_libraries(test_batched_gemm_bias_softmax_gemm_permute_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_softmax_gemm_permute_fp16)
add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_bias_softmax_gemm_permute_fp16)
endif()
if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
add_gtest_executable(test_batched_gemm_softmax_gemm_permute_bf16 test_batched_gemm_softmax_gemm_permute_bf16.cpp)
add_gtest_executable(test_batched_gemm_bias_softmax_gemm_permute_bf16 test_batched_gemm_bias_softmax_gemm_permute_bf16.cpp)
target_link_libraries(test_batched_gemm_softmax_gemm_permute_bf16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
target_link_libraries(test_batched_gemm_bias_softmax_gemm_permute_bf16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_softmax_gemm_permute_bf16)
add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_bias_softmax_gemm_permute_bf16)
endif()
add_custom_target(test_batched_gemm_softmax_gemm_permute)
add_gtest_executable(test_batched_gemm_softmax_gemm_permute_fp16 test_batched_gemm_softmax_gemm_permute_fp16.cpp)
if(result EQUAL 0)
target_link_libraries(test_batched_gemm_softmax_gemm_permute_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_softmax_gemm_permute_fp16)
endif()
add_gtest_executable(test_batched_gemm_bias_softmax_gemm_permute_fp16 test_batched_gemm_bias_softmax_gemm_permute_fp16.cpp)
if(result EQUAL 0)
target_link_libraries(test_batched_gemm_bias_softmax_gemm_permute_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_bias_softmax_gemm_permute_fp16)
endif()
add_gtest_executable(test_batched_gemm_softmax_gemm_permute_bf16 test_batched_gemm_softmax_gemm_permute_bf16.cpp)
if(result EQUAL 0)
target_link_libraries(test_batched_gemm_softmax_gemm_permute_bf16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_softmax_gemm_permute_bf16)
endif()
add_gtest_executable(test_batched_gemm_bias_softmax_gemm_permute_bf16 test_batched_gemm_bias_softmax_gemm_permute_bf16.cpp)
if(result EQUAL 0)
target_link_libraries(test_batched_gemm_bias_softmax_gemm_permute_bf16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_bias_softmax_gemm_permute_bf16)
endif()
set(target 1)
endif()
endforeach()
\ No newline at end of file
@@ -70,10 +70,23 @@ class TestBatchNormBwdRank4 : public ::testing::Test
}
};
using KernelTypes = ::testing::Types<std::tuple<F16, F32, F32, F32, F16, F32, F32>,
std::tuple<F32, F32, F32, F32, F32, F32, F32>,
std::tuple<BF16, F32, F32, F32, BF16, F32, F32>,
std::tuple<F64, F64, F64, F64, F64, F64, F64>>;
using KernelTypes = ::testing::Types<
#ifdef CK_ENABLE_FP16
std::tuple<F16, F32, F32, F32, F16, F32, F32>
#endif
#ifdef CK_ENABLE_FP32
,
std::tuple<F32, F32, F32, F32, F32, F32, F32>
#endif
#ifdef CK_ENABLE_BF16
,
std::tuple<BF16, F32, F32, F32, BF16, F32, F32>
#endif
#ifdef CK_ENABLE_FP64
,
std::tuple<F64, F64, F64, F64, F64, F64, F64>
#endif
>;
TYPED_TEST_SUITE(TestBatchNormBwdRank4, KernelTypes);
......
@@ -87,10 +87,23 @@ class TestBatchNormFwdRank4 : public ::testing::Test
}
};
using KernelTypes = ::testing::Types<std::tuple<F16, F16, F32, F16, F16, F32>,
std::tuple<F32, F32, F32, F32, F32, F32>,
std::tuple<BF16, BF16, F32, BF16, BF16, F32>,
std::tuple<F64, F64, F64, F64, F64, F64>>;
using KernelTypes = ::testing::Types<
#ifdef CK_ENABLE_FP16
std::tuple<F16, F16, F32, F16, F16, F32>
#endif
#ifdef CK_ENABLE_FP32
,
std::tuple<F32, F32, F32, F32, F32, F32>
#endif
#ifdef CK_ENABLE_BF16
,
std::tuple<BF16, BF16, F32, BF16, BF16, F32>
#endif
#ifdef CK_ENABLE_FP64
,
std::tuple<F64, F64, F64, F64, F64, F64>
#endif
>;
TYPED_TEST_SUITE(TestBatchNormFwdRank4, KernelTypes);
......
@@ -67,10 +67,23 @@ class TestBatchNormInferRank4 : public ::testing::Test
}
};
using KernelTypes = ::testing::Types<std::tuple<F16, F16, F32, F16, F16, F32>,
std::tuple<F32, F32, F32, F32, F32, F32>,
std::tuple<BF16, BF16, F32, BF16, BF16, F32>,
std::tuple<F64, F64, F64, F64, F64, F64>>;
using KernelTypes = ::testing::Types<
#ifdef CK_ENABLE_FP16
std::tuple<F16, F16, F32, F16, F16, F32>
#endif
#ifdef CK_ENABLE_FP32
,
std::tuple<F32, F32, F32, F32, F32, F32>
#endif
#ifdef CK_ENABLE_BF16
,
std::tuple<BF16, BF16, F32, BF16, BF16, F32>
#endif
#ifdef CK_ENABLE_FP64
,
std::tuple<F64, F64, F64, F64, F64, F64>
#endif
>;
TYPED_TEST_SUITE(TestBatchNormInferRank4, KernelTypes);
......
add_gtest_executable(test_contraction test_contraction.cpp)
target_link_libraries(test_contraction PRIVATE utility device_contraction_bilinear_instance device_contraction_scale_instance)
list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
set(target 0)
foreach(gpu IN LISTS GPU_TARGETS)
if(gpu IN_LIST gpu_list AND target EQUAL 0)
add_gtest_executable(test_contraction_interface test_contraction_interface.cpp)
target_link_libraries(test_contraction_interface PRIVATE utility device_contraction_bilinear_instance device_contraction_scale_instance)
set(target 1)
endif()
if(gpu IN_LIST gpu_list AND target EQUAL 0)
if((DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64") OR NOT DEFINED DTYPES)
add_gtest_executable(test_contraction test_contraction.cpp)
target_link_libraries(test_contraction PRIVATE utility device_contraction_bilinear_instance device_contraction_scale_instance)
add_gtest_executable(test_contraction_interface test_contraction_interface.cpp)
target_link_libraries(test_contraction_interface PRIVATE utility device_contraction_bilinear_instance device_contraction_scale_instance)
set(target 1)
endif()
endif()
endforeach()