Merge remote-tracking branch 'origin/develop' into stream-k-initial-impl

f74b77bc · carlushuang · b5be51ed · 0d911822 · b5be51ed · b5be51ed
Commit f74b77bc authored Jun 19, 2023 by carlushuang
20 changed files
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-#include <vector>
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-static constexpr index_t RANK = 3;
-void add_device_softmax_i8_i8_rank3_reduce3_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
-{
-    add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 3>{});
-}
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-#include <vector>
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-static constexpr index_t RANK = 4;
-void add_device_softmax_i8_i8_rank4_reduce1_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
-{
-    add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 1>{});
-}
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-#include <vector>
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-static constexpr index_t RANK = 4;
-void add_device_softmax_i8_i8_rank4_reduce2_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
-{
-    add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 2>{});
-}
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-#include <vector>
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-static constexpr index_t RANK = 4;
-void add_device_softmax_i8_i8_rank4_reduce3_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
-{
-    add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 3>{});
-}
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp
+++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-#include <vector>
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp"
-#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-static constexpr index_t RANK = 4;
-void add_device_softmax_i8_i8_rank4_reduce4_instances(
-    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
-{
-    add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 4>{});
-}
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
--- a/profiler/include/profiler/profile_softmax_impl.hpp
+++ b/profiler/include/profiler/profile_softmax_impl.hpp
@@ -40,7 +40,11 @@ template <> std::string type_to_string<int8_t>()  { return "int8"; }
 template <> std::string type_to_string<int32_t>() { return "int32"; }
 // clang-format on
-template <typename InDataType, typename AccDataType, typename OutDataType, index_t Rank>
+template <typename InDataType,
+          typename AccDataType,
+          typename OutDataType,
+          index_t Rank,
+          index_t NumReduceDim>
 bool profile_softmax_impl(int do_verification,
                          int init_method,
                          bool do_log,
@@ -54,7 +58,13 @@ bool profile_softmax_impl(int do_verification,
    if(Rank != in_length.size())
    {
        throw std::runtime_error("Input tensor rank is different from template argument Rank!");
-    }
+    };
+    if(NumReduceDim != reduce_dims.size())
+    {
+        throw std::runtime_error(
+            "Input reduce_dims rank is different from template argument NumReduceDim!");
+    };
    Tensor<InDataType> in = in_strides.empty() ? Tensor<InDataType>(in_length)
                                               : Tensor<InDataType>(in_length, in_strides);
@@ -92,8 +102,13 @@ bool profile_softmax_impl(int do_verification,
    // add device softmax instances
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-    using DeviceOp    = tensor_operation::device::
+    using DeviceOp    = tensor_operation::device::DeviceSoftmax<InDataType,
-        DeviceSoftmax<InDataType, AccDataType, OutDataType, PassThrough, PassThrough, Rank>;
+                                                             AccDataType,
+                                                             OutDataType,
+                                                             PassThrough,
+                                                             PassThrough,
+                                                             Rank,
+                                                             NumReduceDim>;
    // get device op instances
    const auto instances = tensor_operation::device::instance::DeviceOperationInstanceFactory<
@@ -112,13 +127,6 @@ bool profile_softmax_impl(int do_verification,
    for(auto& inst_ptr : instances)
    {
-        // Is this user's responsibility to check if problem mismatches kernel instance (ie. rank 3
-        // problem to rank 4 kernel) other than invoking IsSupportedArgument()?
-        if(!(inst_ptr->GetNumReduceDim() == static_cast<index_t>(reduce_dims.size())))
-        {
-            continue;
-        }
        auto argument_ptr = inst_ptr->MakeArgumentPointer(in_tensor_lengths,
                                                          in_tensor_strides,
                                                          reduce_dims,

--- a/profiler/src/profile_softmax.cpp
+++ b/profiler/src/profile_softmax.cpp
@@ -92,27 +92,76 @@ int profile_softmax(int argc, char* argv[])
    {
        if(data_type == SoftmaxDataType::F16_F16)
        {
-            ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 3>(do_verification,
+            if(reduce.size() == 1)
-                                                                                 init_method,
+                ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 3, 1>(
-                                                                                 do_log,
+                    do_verification,
-                                                                                 time_kernel,
+                    init_method,
-                                                                                 length,
+                    do_log,
-                                                                                 stride,
+                    time_kernel,
-                                                                                 reduce,
+                    length,
-                                                                                 double(alpha),
+                    stride,
-                                                                                 double(beta));
+                    reduce,
+                    double(alpha),
+                    double(beta));
+            else if(reduce.size() == 2)
+                ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 3, 2>(
+                    do_verification,
+                    init_method,
+                    do_log,
+                    time_kernel,
+                    length,
+                    stride,
+                    reduce,
+                    double(alpha),
+                    double(beta));
+            else if(reduce.size() == 3)
+                ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 3, 3>(
+                    do_verification,
+                    init_method,
+                    do_log,
+                    time_kernel,
+                    length,
+                    stride,
+                    reduce,
+                    double(alpha),
+                    double(beta));
+            else
+                throw std::runtime_error("invalid number of dimensions to reduce");
        }
        else if(data_type == SoftmaxDataType::F32_F32)
        {
-            ck::profiler::profile_softmax_impl<float, float, float, 3>(do_verification,
+            if(reduce.size() == 1)
-                                                                       init_method,
+                ck::profiler::profile_softmax_impl<float, float, float, 3, 1>(do_verification,
-                                                                       do_log,
+                                                                              init_method,
-                                                                       time_kernel,
+                                                                              do_log,
-                                                                       length,
+                                                                              time_kernel,
-                                                                       stride,
+                                                                              length,
-                                                                       reduce,
+                                                                              stride,
-                                                                       double(alpha),
+                                                                              reduce,
-                                                                       double(beta));
+                                                                              double(alpha),
+                                                                              double(beta));
+            else if(reduce.size() == 2)
+                ck::profiler::profile_softmax_impl<float, float, float, 3, 2>(do_verification,
+                                                                              init_method,
+                                                                              do_log,
+                                                                              time_kernel,
+                                                                              length,
+                                                                              stride,
+                                                                              reduce,
+                                                                              double(alpha),
+                                                                              double(beta));
+            else if(reduce.size() == 3)
+                ck::profiler::profile_softmax_impl<float, float, float, 3, 3>(do_verification,
+                                                                              init_method,
+                                                                              do_log,
+                                                                              time_kernel,
+                                                                              length,
+                                                                              stride,
+                                                                              reduce,
+                                                                              double(alpha),
+                                                                              double(beta));
+            else
+                throw std::runtime_error("invalid number of dimensions to reduce");
        }
        else
        {
@@ -124,27 +173,97 @@ int profile_softmax(int argc, char* argv[])
    {
        if(data_type == SoftmaxDataType::F16_F16)
        {
-            ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 4>(do_verification,
+            if(reduce.size() == 1)
-                                                                                 init_method,
+                ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 4, 1>(
-                                                                                 do_log,
+                    do_verification,
-                                                                                 time_kernel,
+                    init_method,
-                                                                                 length,
+                    do_log,
-                                                                                 stride,
+                    time_kernel,
-                                                                                 reduce,
+                    length,
-                                                                                 double(alpha),
+                    stride,
-                                                                                 double(beta));
+                    reduce,
+                    double(alpha),
+                    double(beta));
+            else if(reduce.size() == 2)
+                ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 4, 2>(
+                    do_verification,
+                    init_method,
+                    do_log,
+                    time_kernel,
+                    length,
+                    stride,
+                    reduce,
+                    double(alpha),
+                    double(beta));
+            else if(reduce.size() == 3)
+                ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 4, 3>(
+                    do_verification,
+                    init_method,
+                    do_log,
+                    time_kernel,
+                    length,
+                    stride,
+                    reduce,
+                    double(alpha),
+                    double(beta));
+            else if(reduce.size() == 4)
+                ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 4, 4>(
+                    do_verification,
+                    init_method,
+                    do_log,
+                    time_kernel,
+                    length,
+                    stride,
+                    reduce,
+                    double(alpha),
+                    double(beta));
+            else
+                throw std::runtime_error("invalid number of dimensions to reduce");
        }
        else if(data_type == SoftmaxDataType::F32_F32)
        {
-            ck::profiler::profile_softmax_impl<float, float, float, 4>(do_verification,
+            if(reduce.size() == 1)
-                                                                       init_method,
+                ck::profiler::profile_softmax_impl<float, float, float, 4, 1>(do_verification,
-                                                                       do_log,
+                                                                              init_method,
-                                                                       time_kernel,
+                                                                              do_log,
-                                                                       length,
+                                                                              time_kernel,
-                                                                       stride,
+                                                                              length,
-                                                                       reduce,
+                                                                              stride,
-                                                                       double(alpha),
+                                                                              reduce,
-                                                                       double(beta));
+                                                                              double(alpha),
+                                                                              double(beta));
+            else if(reduce.size() == 2)
+                ck::profiler::profile_softmax_impl<float, float, float, 4, 2>(do_verification,
+                                                                              init_method,
+                                                                              do_log,
+                                                                              time_kernel,
+                                                                              length,
+                                                                              stride,
+                                                                              reduce,
+                                                                              double(alpha),
+                                                                              double(beta));
+            else if(reduce.size() == 3)
+                ck::profiler::profile_softmax_impl<float, float, float, 4, 3>(do_verification,
+                                                                              init_method,
+                                                                              do_log,
+                                                                              time_kernel,
+                                                                              length,
+                                                                              stride,
+                                                                              reduce,
+                                                                              double(alpha),
+                                                                              double(beta));
+            else if(reduce.size() == 4)
+                ck::profiler::profile_softmax_impl<float, float, float, 4, 4>(do_verification,
+                                                                              init_method,
+                                                                              do_log,
+                                                                              time_kernel,
+                                                                              length,
+                                                                              stride,
+                                                                              reduce,
+                                                                              double(alpha),
+                                                                              double(beta));
+            else
+                throw std::runtime_error("invalid number of dimensions to reduce");
        }
        else
        {

--- a/test/batched_gemm/CMakeLists.txt
+++ b/test/batched_gemm/CMakeLists.txt
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
   add_test_executable(test_batched_gemm_fp16 batched_gemm_fp16.cpp)
   target_link_libraries(test_batched_gemm_fp16 PRIVATE utility)
   target_link_libraries(test_batched_gemm_fp16 PRIVATE device_batched_gemm_instance)
@@ -14,4 +17,6 @@ if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS M
   add_test_executable(test_batched_gemm_int8 batched_gemm_int8.cpp)
   target_link_libraries(test_batched_gemm_int8 PRIVATE utility)
   target_link_libraries(test_batched_gemm_int8 PRIVATE device_batched_gemm_instance)
-endif()
+   set(target 1)
\ No newline at end of file
+ endif()
+endforeach()
\ No newline at end of file
--- a/test/batched_gemm_gemm/CMakeLists.txt
+++ b/test/batched_gemm_gemm/CMakeLists.txt
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
   add_custom_target(test_batched_gemm_gemm)
   add_gtest_executable(test_batched_gemm_gemm_fp16 test_batched_gemm_gemm_fp16.cpp)
   target_link_libraries(test_batched_gemm_gemm_fp16 PRIVATE utility device_batched_gemm_gemm_instance)
   add_dependencies(test_batched_gemm_gemm test_batched_gemm_gemm_fp16)
-endif()
+   set(target 1)
\ No newline at end of file
+ endif()
+endforeach()
\ No newline at end of file
--- a/test/batched_gemm_reduce/CMakeLists.txt
+++ b/test/batched_gemm_reduce/CMakeLists.txt
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
   add_test_executable(test_batched_gemm_reduce_fp16 batched_gemm_reduce_fp16.cpp)
   target_link_libraries(test_batched_gemm_reduce_fp16 PRIVATE utility)
   target_link_libraries(test_batched_gemm_reduce_fp16 PRIVATE device_batched_gemm_reduce_instance)
-endif()
+   set(target 1)
+ endif()
+endforeach()
--- a/test/batched_gemm_softmax_gemm/CMakeLists.txt
+++ b/test/batched_gemm_softmax_gemm/CMakeLists.txt
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
   add_custom_target(test_batched_gemm_softmax_gemm)
   add_gtest_executable(test_batched_gemm_softmax_gemm_fp16 test_batched_gemm_softmax_gemm_fp16.cpp)
   target_link_libraries(test_batched_gemm_softmax_gemm_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_instance)
   add_dependencies(test_batched_gemm_softmax_gemm test_batched_gemm_softmax_gemm_fp16)
-endif()
+   set(target 1)
\ No newline at end of file
+ endif()
+endforeach()
\ No newline at end of file
--- a/test/batched_gemm_softmax_gemm_permute/CMakeLists.txt
+++ b/test/batched_gemm_softmax_gemm_permute/CMakeLists.txt
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
   add_custom_target(test_batched_gemm_softmax_gemm_permute)
   add_gtest_executable(test_batched_gemm_softmax_gemm_permute_fp16 test_batched_gemm_softmax_gemm_permute_fp16.cpp)
@@ -14,4 +17,6 @@ if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS M
   target_link_libraries(test_batched_gemm_bias_softmax_gemm_permute_bf16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
   add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_bias_softmax_gemm_permute_fp16)
   add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_bias_softmax_gemm_permute_bf16)
-endif()
+   set(target 1)
\ No newline at end of file
+ endif()
+endforeach()
\ No newline at end of file
--- a/test/contraction/CMakeLists.txt
+++ b/test/contraction/CMakeLists.txt
 add_gtest_executable(test_contraction test_contraction.cpp)
 target_link_libraries(test_contraction PRIVATE utility device_contraction_bilinear_instance device_contraction_scale_instance)
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
    add_gtest_executable(test_contraction_interface test_contraction_interface.cpp)
    target_link_libraries(test_contraction_interface PRIVATE utility device_contraction_bilinear_instance device_contraction_scale_instance)
-endif()
+   set(target 1)
+ endif()
+endforeach()
--- a/test/convnd_bwd_data/CMakeLists.txt
+++ b/test/convnd_bwd_data/CMakeLists.txt
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
   add_gtest_executable(test_convnd_bwd_data convnd_bwd_data.cpp)
   target_link_libraries(test_convnd_bwd_data PRIVATE utility device_conv1d_bwd_data_instance device_conv2d_bwd_data_instance device_conv3d_bwd_data_instance)
-endif()
+   set(target 1)
\ No newline at end of file
+ endif()
+endforeach()
\ No newline at end of file
--- a/test/convnd_fwd/CMakeLists.txt
+++ b/test/convnd_fwd/CMakeLists.txt
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
   add_gtest_executable(test_convnd_fwd convnd_fwd.cpp)
   target_link_libraries(test_convnd_fwd PRIVATE utility device_conv2d_fwd_instance)
-endif()
+   set(target 1)
+ endif()
+endforeach()
--- a/test/gemm_layernorm/CMakeLists.txt
+++ b/test/gemm_layernorm/CMakeLists.txt
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
   add_custom_target(test_gemm_layernorm)
   add_gtest_executable(test_gemm_add_relu_add_layernorm_fp16 test_gemm_add_relu_add_layernorm_fp16.cpp)
   target_link_libraries(test_gemm_add_relu_add_layernorm_fp16 PRIVATE utility device_gemm_add_relu_add_layernorm_instance)
   add_dependencies(test_gemm_layernorm test_gemm_add_relu_add_layernorm_fp16)
-endif()
+   set(target 1)
+ endif()
+endforeach()
--- a/test/gemm_split_k/CMakeLists.txt
+++ b/test/gemm_split_k/CMakeLists.txt
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
   add_gtest_executable(test_gemm_splitk test_gemm_splitk.cpp)
   target_link_libraries(test_gemm_splitk PRIVATE utility device_gemm_splitk_instance)
-endif()
+   set(target 1)
+ endif()
+endforeach()
--- a/test/grouped_convnd_bwd_weight/CMakeLists.txt
+++ b/test/grouped_convnd_bwd_weight/CMakeLists.txt
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
   add_gtest_executable(test_grouped_convnd_bwd_weight grouped_convnd_bwd_weight.cpp)
-    target_link_libraries(test_grouped_convnd_bwd_weight PRIVATE utility device_grouped_conv1d_bwd_weight_instance device_grouped_conv2d_bwd_weight_instance device_grouped_conv3d_bwd_weight_instance)
+   target_link_libraries(test_grouped_convnd_bwd_weight PRIVATE utility device_grouped_conv1d_bwd_weight_instance device_grouped_conv2d_bwd_weight_instance device_grouped_conv3d_bwd_weight_instance)
-endif()
+   set(target 1)
\ No newline at end of file
+ endif()
+endforeach()
\ No newline at end of file
--- a/test/grouped_gemm/CMakeLists.txt
+++ b/test/grouped_gemm/CMakeLists.txt
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
   add_custom_target(test_grouped_gemm)
   add_gtest_executable(test_grouped_gemm_splitk test_grouped_gemm_splitk.cpp)
   add_gtest_executable(test_grouped_gemm_interface test_grouped_gemm_interface.cpp)
@@ -6,4 +9,6 @@ if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS M
   target_link_libraries(test_grouped_gemm_interface PRIVATE utility device_grouped_gemm_instance)
   add_dependencies(test_grouped_gemm test_grouped_gemm_splitk test_grouped_gemm_interface)
-endif()
+   set(target 1)
+ endif()
+endforeach()
--- a/test/softmax/test_softmax_rank3.cpp
+++ b/test/softmax/test_softmax_rank3.cpp
@@ -13,7 +13,6 @@ using I = ck::Number<N>;
 using F16 = ck::half_t;
 using F32 = float;
-using I8  = int8_t;
 template <typename Tuple>
 class TestSoftmax : public ck::TestSoftmax<Tuple>
@@ -24,8 +23,7 @@ class TestSoftmax : public ck::TestSoftmax<Tuple>
 using KernelTypes = ::testing::Types<
    //         InDataType, AccDataType, OutDataType, Rank
    std::tuple<       F16,         F32,         F16,    I<3>>,
-    std::tuple<       F32,         F32,         F32,    I<3>>,
+    std::tuple<       F32,         F32,         F32,    I<3>>
-    std::tuple<        I8,         F32,          I8,    I<3>>
    >;
 // clang-format on