Merge branch 'amd-develop' into amd-master

5683ea4e · Jun Liu · f0831350 · dddc2115 · 5683ea4e · 5683ea4e
Commit 5683ea4e authored Aug 08, 2023 by Jun Liu
19 changed files
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -60,6 +60,6 @@ add_subdirectory(contraction)
 add_subdirectory(pool_fwd)
 add_subdirectory(batched_gemm_multi_d)
 add_subdirectory(grouped_convnd_bwd_data)
-if(GPU_TARGETS MATCHES "gfx1100")
+if(GPU_TARGETS MATCHES "gfx11")
    add_subdirectory(wmma_op)
 endif()
--- a/test/batched_gemm/CMakeLists.txt
+++ b/test/batched_gemm/CMakeLists.txt
@@ -2,21 +2,26 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list AND target EQUAL 0)
-   add_test_executable(test_batched_gemm_fp16 batched_gemm_fp16.cpp)
+   if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-   target_link_libraries(test_batched_gemm_fp16 PRIVATE utility)
+      add_test_executable(test_batched_gemm_fp16 batched_gemm_fp16.cpp)
-   target_link_libraries(test_batched_gemm_fp16 PRIVATE device_batched_gemm_instance)
+      target_link_libraries(test_batched_gemm_fp16 PRIVATE utility)
+      target_link_libraries(test_batched_gemm_fp16 PRIVATE device_batched_gemm_instance)
-   add_test_executable(test_batched_gemm_fp32 batched_gemm_fp32.cpp)
+   endif()
-   target_link_libraries(test_batched_gemm_fp32 PRIVATE utility)
+   if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
-   target_link_libraries(test_batched_gemm_fp32 PRIVATE device_batched_gemm_instance)
+      add_test_executable(test_batched_gemm_fp32 batched_gemm_fp32.cpp)
+      target_link_libraries(test_batched_gemm_fp32 PRIVATE utility)
-   add_test_executable(test_batched_gemm_bf16 batched_gemm_bf16.cpp)
+      target_link_libraries(test_batched_gemm_fp32 PRIVATE device_batched_gemm_instance)
-   target_link_libraries(test_batched_gemm_bf16 PRIVATE utility)
+   endif()
-   target_link_libraries(test_batched_gemm_bf16 PRIVATE device_batched_gemm_instance)
+   if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
+      add_test_executable(test_batched_gemm_bf16 batched_gemm_bf16.cpp)
-   add_test_executable(test_batched_gemm_int8 batched_gemm_int8.cpp)
+      target_link_libraries(test_batched_gemm_bf16 PRIVATE utility)
-   target_link_libraries(test_batched_gemm_int8 PRIVATE utility)
+      target_link_libraries(test_batched_gemm_bf16 PRIVATE device_batched_gemm_instance)
-   target_link_libraries(test_batched_gemm_int8 PRIVATE device_batched_gemm_instance)
+   endif()
+   if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
+      add_test_executable(test_batched_gemm_int8 batched_gemm_int8.cpp)
+      target_link_libraries(test_batched_gemm_int8 PRIVATE utility)
+      target_link_libraries(test_batched_gemm_int8 PRIVATE device_batched_gemm_instance)
+   endif()
   set(target 1)
 endif()
 endforeach()
\ No newline at end of file
--- a/test/batched_gemm_gemm/CMakeLists.txt
+++ b/test/batched_gemm_gemm/CMakeLists.txt
@@ -2,10 +2,12 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list AND target EQUAL 0)
-   add_custom_target(test_batched_gemm_gemm)
+   if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-   add_gtest_executable(test_batched_gemm_gemm_fp16 test_batched_gemm_gemm_fp16.cpp)
+      add_custom_target(test_batched_gemm_gemm)
-   target_link_libraries(test_batched_gemm_gemm_fp16 PRIVATE utility device_batched_gemm_gemm_instance)
+      add_gtest_executable(test_batched_gemm_gemm_fp16 test_batched_gemm_gemm_fp16.cpp)
-   add_dependencies(test_batched_gemm_gemm test_batched_gemm_gemm_fp16)
+      target_link_libraries(test_batched_gemm_gemm_fp16 PRIVATE utility device_batched_gemm_gemm_instance)
-   set(target 1)
+      add_dependencies(test_batched_gemm_gemm test_batched_gemm_gemm_fp16)
+      set(target 1)
+   endif()
 endif()
 endforeach()
\ No newline at end of file
--- a/test/batched_gemm_multi_d/CMakeLists.txt
+++ b/test/batched_gemm_multi_d/CMakeLists.txt
-# TODO: Enable for gfx90a after complier fix
 if(DL_KERNELS)
-  add_gtest_executable(test_batched_gemm_multi_d test_batched_gemm_multi_d.cpp)
+    add_gtest_executable(test_batched_gemm_multi_d test_batched_gemm_multi_d.cpp)
-  target_link_libraries(test_batched_gemm_multi_d PRIVATE utility device_batched_gemm_multi_d_instance)
+    target_link_libraries(test_batched_gemm_multi_d PRIVATE utility device_batched_gemm_multi_d_instance)
 endif()
--- a/test/batched_gemm_reduce/CMakeLists.txt
+++ b/test/batched_gemm_reduce/CMakeLists.txt
@@ -2,9 +2,11 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list AND target EQUAL 0)
-   add_test_executable(test_batched_gemm_reduce_fp16 batched_gemm_reduce_fp16.cpp)
+   if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-   target_link_libraries(test_batched_gemm_reduce_fp16 PRIVATE utility)
+     add_test_executable(test_batched_gemm_reduce_fp16 batched_gemm_reduce_fp16.cpp)
-   target_link_libraries(test_batched_gemm_reduce_fp16 PRIVATE device_batched_gemm_reduce_instance)
+     target_link_libraries(test_batched_gemm_reduce_fp16 PRIVATE utility)
-   set(target 1)
+     target_link_libraries(test_batched_gemm_reduce_fp16 PRIVATE device_batched_gemm_reduce_instance)
+     set(target 1)
+   endif()
 endif()
 endforeach()
--- a/test/batched_gemm_softmax_gemm/CMakeLists.txt
+++ b/test/batched_gemm_softmax_gemm/CMakeLists.txt
@@ -2,10 +2,12 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list AND target EQUAL 0)
-   add_custom_target(test_batched_gemm_softmax_gemm)
+   if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-   add_gtest_executable(test_batched_gemm_softmax_gemm_fp16 test_batched_gemm_softmax_gemm_fp16.cpp)
+     add_custom_target(test_batched_gemm_softmax_gemm)
-   target_link_libraries(test_batched_gemm_softmax_gemm_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_instance)
+     add_gtest_executable(test_batched_gemm_softmax_gemm_fp16 test_batched_gemm_softmax_gemm_fp16.cpp)
-   add_dependencies(test_batched_gemm_softmax_gemm test_batched_gemm_softmax_gemm_fp16)
+     target_link_libraries(test_batched_gemm_softmax_gemm_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_instance)
-   set(target 1)
+     add_dependencies(test_batched_gemm_softmax_gemm test_batched_gemm_softmax_gemm_fp16)
+     set(target 1)
+   endif()
 endif()
 endforeach()
\ No newline at end of file
--- a/test/batched_gemm_softmax_gemm_permute/CMakeLists.txt
+++ b/test/batched_gemm_softmax_gemm_permute/CMakeLists.txt
@@ -2,21 +2,25 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list AND target EQUAL 0)
-   add_custom_target(test_batched_gemm_softmax_gemm_permute)
+   if(DTYPES MATCHES "fp16" OR DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
+     add_custom_target(test_batched_gemm_softmax_gemm_permute)
-   add_gtest_executable(test_batched_gemm_softmax_gemm_permute_fp16 test_batched_gemm_softmax_gemm_permute_fp16.cpp)
+   endif()
-   add_gtest_executable(test_batched_gemm_softmax_gemm_permute_bf16 test_batched_gemm_softmax_gemm_permute_bf16.cpp)
+   if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-   target_link_libraries(test_batched_gemm_softmax_gemm_permute_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
+     add_gtest_executable(test_batched_gemm_softmax_gemm_permute_fp16 test_batched_gemm_softmax_gemm_permute_fp16.cpp)
-   target_link_libraries(test_batched_gemm_softmax_gemm_permute_bf16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
+     add_gtest_executable(test_batched_gemm_bias_softmax_gemm_permute_fp16 test_batched_gemm_bias_softmax_gemm_permute_fp16.cpp)
-   add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_softmax_gemm_permute_fp16)
+     target_link_libraries(test_batched_gemm_softmax_gemm_permute_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
-   add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_softmax_gemm_permute_bf16)
+     target_link_libraries(test_batched_gemm_bias_softmax_gemm_permute_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
+     add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_softmax_gemm_permute_fp16)
-   add_gtest_executable(test_batched_gemm_bias_softmax_gemm_permute_fp16 test_batched_gemm_bias_softmax_gemm_permute_fp16.cpp)
+     add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_bias_softmax_gemm_permute_fp16)
-   add_gtest_executable(test_batched_gemm_bias_softmax_gemm_permute_bf16 test_batched_gemm_bias_softmax_gemm_permute_bf16.cpp)
+   endif()
-   target_link_libraries(test_batched_gemm_bias_softmax_gemm_permute_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
+   if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
-   target_link_libraries(test_batched_gemm_bias_softmax_gemm_permute_bf16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
+     add_gtest_executable(test_batched_gemm_softmax_gemm_permute_bf16 test_batched_gemm_softmax_gemm_permute_bf16.cpp)
-   add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_bias_softmax_gemm_permute_fp16)
+     add_gtest_executable(test_batched_gemm_bias_softmax_gemm_permute_bf16 test_batched_gemm_bias_softmax_gemm_permute_bf16.cpp)
-   add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_bias_softmax_gemm_permute_bf16)
+     target_link_libraries(test_batched_gemm_softmax_gemm_permute_bf16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
+     target_link_libraries(test_batched_gemm_bias_softmax_gemm_permute_bf16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
+     add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_softmax_gemm_permute_bf16)
+     add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_bias_softmax_gemm_permute_bf16)
+   endif()
   set(target 1)
 endif()
 endforeach()
\ No newline at end of file
--- a/test/elementwise_normalization/CMakeLists.txt
+++ b/test/elementwise_normalization/CMakeLists.txt
-add_custom_target(test_elementwise_normalization)
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+  add_custom_target(test_elementwise_normalization)
-add_gtest_executable(test_elementwise_layernorm_fp16 test_elementwise_layernorm_fp16.cpp)
+  add_gtest_executable(test_elementwise_layernorm_fp16 test_elementwise_layernorm_fp16.cpp)
+  target_link_libraries(test_elementwise_layernorm_fp16 PRIVATE utility device_elementwise_normalization_instance)
-target_link_libraries(test_elementwise_layernorm_fp16 PRIVATE utility device_elementwise_normalization_instance)
+  add_dependencies(test_elementwise_normalization test_elementwise_layernorm_fp16)
+endif()
-add_dependencies(test_elementwise_normalization test_elementwise_layernorm_fp16)
\ No newline at end of file
--- a/test/gemm_layernorm/CMakeLists.txt
+++ b/test/gemm_layernorm/CMakeLists.txt
@@ -2,10 +2,12 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list AND target EQUAL 0)
+  if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
   add_custom_target(test_gemm_layernorm)
   add_gtest_executable(test_gemm_add_relu_add_layernorm_fp16 test_gemm_add_relu_add_layernorm_fp16.cpp)
   target_link_libraries(test_gemm_add_relu_add_layernorm_fp16 PRIVATE utility device_gemm_add_relu_add_layernorm_instance)
   add_dependencies(test_gemm_layernorm test_gemm_add_relu_add_layernorm_fp16)
   set(target 1)
+  endif()
 endif()
 endforeach()
--- a/test/gemm_reduce/CMakeLists.txt
+++ b/test/gemm_reduce/CMakeLists.txt
-add_test_executable(test_gemm_reduce_fp16 gemm_reduce_fp16.cpp)
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-target_link_libraries(test_gemm_reduce_fp16 PRIVATE utility)
+  add_test_executable(test_gemm_reduce_fp16 gemm_reduce_fp16.cpp)
-target_link_libraries(test_gemm_reduce_fp16 PRIVATE device_gemm_reduce_instance)
+  target_link_libraries(test_gemm_reduce_fp16 PRIVATE utility)
+  target_link_libraries(test_gemm_reduce_fp16 PRIVATE device_gemm_reduce_instance)
+endif()
\ No newline at end of file
--- a/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight_interface.cpp
+++ b/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight_interface.cpp
@@ -70,10 +70,11 @@ class TestGroupedConvndBwdWeight : public ::testing::Test
            ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(
                conv_param);
-        std::array<ck::index_t, NDimSpatial> input_spatial_lengths{};
+        std::array<ck::index_t, NDimSpatial + 3> input_lengths{};
-        std::array<ck::index_t, NDimSpatial> filter_spatial_lengths{};
+        std::array<ck::index_t, NDimSpatial + 3> filter_lengths{};
-        std::array<ck::index_t, NDimSpatial> output_spatial_lengths{};
+        std::array<ck::index_t, NDimSpatial + 3> output_lengths{};
        std::array<ck::index_t, NDimSpatial + 3> input_strides{};
+        std::array<ck::index_t, NDimSpatial + 3> weights_strides{};
        std::array<ck::index_t, NDimSpatial + 3> output_strides{};
        std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
        std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
@@ -82,10 +83,11 @@ class TestGroupedConvndBwdWeight : public ::testing::Test
        auto range_copy = [](const auto& from, auto to) { std::copy(begin(from), end(from), to); };
-        range_copy(conv_param.input_spatial_lengths_, begin(input_spatial_lengths));
+        range_copy(in_g_n_c_wis_desc.GetLengths(), begin(input_lengths));
-        range_copy(conv_param.filter_spatial_lengths_, begin(filter_spatial_lengths));
-        range_copy(conv_param.output_spatial_lengths_, begin(output_spatial_lengths));
        range_copy(in_g_n_c_wis_desc.GetStrides(), begin(input_strides));
+        range_copy(wei_g_k_c_xs_desc.GetLengths(), begin(filter_lengths));
+        range_copy(wei_g_k_c_xs_desc.GetStrides(), begin(weights_strides));
+        range_copy(out_g_n_k_wos_desc.GetLengths(), begin(output_lengths));
        range_copy(out_g_n_k_wos_desc.GetStrides(), begin(output_strides));
        range_copy(conv_param.conv_filter_strides_, begin(conv_filter_strides));
        range_copy(conv_param.conv_filter_dilations_, begin(conv_filter_dilations));
@@ -97,14 +99,11 @@ class TestGroupedConvndBwdWeight : public ::testing::Test
        auto argument = conv.MakeArgument(nullptr,
                                          nullptr,
                                          nullptr,
-                                          conv_param.G_,
+                                          input_lengths,
-                                          conv_param.N_,
-                                          conv_param.K_,
-                                          conv_param.C_,
-                                          input_spatial_lengths,
-                                          filter_spatial_lengths,
-                                          output_spatial_lengths,
                                          input_strides,
+                                          filter_lengths,
+                                          weights_strides,
+                                          output_lengths,
                                          output_strides,
                                          conv_filter_strides,
                                          conv_filter_dilations,

--- a/test/grouped_gemm/CMakeLists.txt
+++ b/test/grouped_gemm/CMakeLists.txt
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
 list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
@@ -12,3 +13,4 @@ foreach(gpu IN LISTS GPU_TARGETS)
   set(target 1)
 endif()
 endforeach()
+endif()
--- a/test/normalization/CMakeLists.txt
+++ b/test/normalization/CMakeLists.txt
-add_custom_target(test_normalization)
+if(DTYPES MATCHES "fp16" OR DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
+  add_custom_target(test_normalization)
-add_gtest_executable(test_layernorm2d_fp32 test_layernorm2d_fp32.cpp)
+endif()
-add_gtest_executable(test_layernorm2d_fp16 test_layernorm2d_fp16.cpp)
+if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
-add_gtest_executable(test_groupnorm_fp16 test_groupnorm_fp16.cpp)
+  add_gtest_executable(test_layernorm2d_fp32 test_layernorm2d_fp32.cpp)
-add_gtest_executable(test_groupnorm_fp32 test_groupnorm_fp32.cpp)
+  add_gtest_executable(test_groupnorm_fp32 test_groupnorm_fp32.cpp)
+  target_link_libraries(test_layernorm2d_fp32 PRIVATE utility device_normalization_instance)
-target_link_libraries(test_layernorm2d_fp32 PRIVATE utility device_normalization_instance)
+  target_link_libraries(test_groupnorm_fp32 PRIVATE utility device_normalization_instance)
-target_link_libraries(test_layernorm2d_fp16 PRIVATE utility device_normalization_instance)
+  add_dependencies(test_normalization test_layernorm2d_fp32)
-target_link_libraries(test_groupnorm_fp16 PRIVATE utility device_normalization_instance)
+  add_dependencies(test_normalization test_groupnorm_fp32)
-target_link_libraries(test_groupnorm_fp32 PRIVATE utility device_normalization_instance)
+endif()
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-add_dependencies(test_normalization test_layernorm2d_fp32)
+  add_gtest_executable(test_layernorm2d_fp16 test_layernorm2d_fp16.cpp)
-add_dependencies(test_normalization test_layernorm2d_fp16)
+  add_gtest_executable(test_groupnorm_fp16 test_groupnorm_fp16.cpp)
-add_dependencies(test_normalization test_groupnorm_fp16)
+  target_link_libraries(test_layernorm2d_fp16 PRIVATE utility device_normalization_instance)
-add_dependencies(test_normalization test_groupnorm_fp32)
+  target_link_libraries(test_groupnorm_fp16 PRIVATE utility device_normalization_instance)
+  add_dependencies(test_normalization test_layernorm2d_fp16)
+  add_dependencies(test_normalization test_groupnorm_fp16)
+endif()
--- a/test/pool_fwd/test_avg_pool2d_fwd.cpp
+++ b/test/pool_fwd/test_avg_pool2d_fwd.cpp
@@ -41,8 +41,12 @@ class TestAvgPool2dFwd : public ::testing::Test
    }
 };
+#ifdef __fp16__
 using KernelTypes =
    ::testing::Types<std::tuple<F16, F16, F32, I32>, std::tuple<F32, F32, F32, I32>>;
+#else
+using KernelTypes = ::testing::Types<std::tuple<F32, F32, F32, I32>>;
+#endif
 TYPED_TEST_SUITE(TestAvgPool2dFwd, KernelTypes);
 TYPED_TEST(TestAvgPool2dFwd, Test_Pool)

--- a/test/pool_fwd/test_avg_pool3d_fwd.cpp
+++ b/test/pool_fwd/test_avg_pool3d_fwd.cpp
@@ -40,10 +40,12 @@ class TestAvgPool3dFwd : public ::testing::Test
        }
    }
 };
+#ifdef __fp16__
 using KernelTypes =
    ::testing::Types<std::tuple<F16, F16, F32, I32>, std::tuple<F32, F32, F32, I32>>;
+#else
+using KernelTypes = ::testing::Types<std::tuple<F32, F32, F32, I32>>;
+#endif
 TYPED_TEST_SUITE(TestAvgPool3dFwd, KernelTypes);
 TYPED_TEST(TestAvgPool3dFwd, Test_Pool)
 {

--- a/test/pool_fwd/test_max_pool2d_fwd.cpp
+++ b/test/pool_fwd/test_max_pool2d_fwd.cpp
@@ -59,10 +59,12 @@ class TestMaxPool2dFwd : public ::testing::Test
        }
    }
 };
+#ifdef __fp16__
 using KernelTypes =
    ::testing::Types<std::tuple<F16, F16, F16, I32>, std::tuple<F32, F32, F32, I32>>;
+#else
+using KernelTypes = ::testing::Types<std::tuple<F32, F32, F32, I32>>;
+#endif
 TYPED_TEST_SUITE(TestMaxPool2dFwd, KernelTypes);
 TYPED_TEST(TestMaxPool2dFwd, Test_Pool)
 {

--- a/test/pool_fwd/test_max_pool3d_fwd.cpp
+++ b/test/pool_fwd/test_max_pool3d_fwd.cpp
@@ -60,8 +60,12 @@ class TestMaxPool3dFwd : public ::testing::Test
    }
 };
+#ifdef __fp16__
 using KernelTypes =
-    ::testing::Types<std::tuple<F16, F16, F16, I32>, std::tuple<F32, F32, F32, I32>>;
+    ::testing::Types<std::tuple<F16, F16, F32, I32>, std::tuple<F32, F32, F32, I32>>;
+#else
+using KernelTypes = ::testing::Types<std::tuple<F32, F32, F32, I32>>;
+#endif
 TYPED_TEST_SUITE(TestMaxPool3dFwd, KernelTypes);
 TYPED_TEST(TestMaxPool3dFwd, Test_Pool)

--- a/test/softmax/test_softmax_rank3.cpp
+++ b/test/softmax/test_softmax_rank3.cpp
@@ -10,8 +10,9 @@
 template <ck::index_t N>
 using I = ck::Number<N>;
+#ifdef __fp16__
 using F16 = ck::half_t;
+#endif
 using F32 = float;
 template <typename Tuple>
@@ -22,7 +23,9 @@ class TestSoftmax : public ck::TestSoftmax<Tuple>
 // clang-format off
 using KernelTypes = ::testing::Types<
    //         InDataType, AccDataType, OutDataType, Rank
+#ifdef __fp16__
    std::tuple<       F16,         F32,         F16,    I<3>>,
+#endif
    std::tuple<       F32,         F32,         F32,    I<3>>
    >;
 // clang-format on

--- a/test/softmax/test_softmax_rank4.cpp
+++ b/test/softmax/test_softmax_rank4.cpp
@@ -10,8 +10,9 @@
 template <ck::index_t N>
 using I = ck::Number<N>;
+#ifdef __fp16__
 using F16 = ck::half_t;
+#endif
 using F32 = float;
 template <typename Tuple>
@@ -22,7 +23,9 @@ class TestSoftmax : public ck::TestSoftmax<Tuple>
 // clang-format off
 using KernelTypes = ::testing::Types<
    //         InDataType, AccDataType, OutDataType, Rank
+#ifdef __fp16__
    std::tuple<       F16,         F32,         F16,    I<4>>,
+#endif
    std::tuple<       F32,         F32,         F32,    I<4>>
    >;
 // clang-format on