Enable gfx941 and gfx942 architectures. (#752)

* enable gfx941/942 targets * fix clang format * fix the cmake logic for multiple targets * fix cmake syntax for looping over targets * add gfx941/942 support for gemm_xdl instances

Enable gfx941 and gfx942 architectures. (#752)
* enable gfx941/942 targets * fix clang format * fix the cmake logic for multiple targets * fix cmake syntax for looping over targets * add gfx941/942 support for gemm_xdl instances
027e46ee · Illia Silin · GitHub · 309b1c64 · 027e46ee · 027e46ee
Unverified Commit 027e46ee authored Jun 15, 2023 by Illia Silin Committed by GitHub Jun 15, 2023
16 changed files
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp
@@ -54,7 +54,7 @@ __global__ void
            const Block2CTileMap block_2_ctile_map)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__))
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
    GridwiseGemm::template Run<HasMainKBlockLoop>(

--- a/include/ck/utility/amd_xdlops.hpp
+++ b/include/ck/utility/amd_xdlops.hpp
@@ -344,7 +344,7 @@ struct intrin_mfma_f64_16x16x4f64<16, 16>
    template <class FloatC>
    __device__ static void Run(const double& reg_a, const double& reg_b, FloatC& reg_c)
    {
-#if defined(__gfx90a__) || defined(__gfx940__)
+#if defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
        reg_c.template AsType<double4_t>()(Number<0>{}) = __builtin_amdgcn_mfma_f64_16x16x4f64(
            reg_a, reg_b, reg_c.template AsType<double4_t>()[Number<0>{}], 0, 0, 0);
 #else

--- a/script/cmake-ck-dev.sh
+++ b/script/cmake-ck-dev.sh
@@ -12,7 +12,7 @@ cmake
 -save-temps=$PWD"                                                                                 \
 -D CMAKE_BUILD_TYPE=Release                                                                       \
 -D BUILD_DEV=ON                                                                                   \
-D GPU_TARGETS="gfx908;gfx90a;gfx940"                                                             \
+-D GPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942"                                               \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                                 \
 -D USE_BITINT_EXTENSION_INT4=OFF                                                                  \
 ${MY_PROJECT_SOURCE}

--- a/script/cmake-ck-release.sh
+++ b/script/cmake-ck-release.sh
@@ -11,7 +11,7 @@ cmake
 -D CMAKE_CXX_FLAGS="-O3"                                                                          \
 -D CMAKE_BUILD_TYPE=Release                                                                       \
 -D BUILD_DEV=OFF                                                                                  \
-D GPU_TARGETS="gfx908;gfx90a;gfx940"                                                             \
+-D GPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942"                                               \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                                 \
 -D USE_BITINT_EXTENSION_INT4=OFF                                                                  \
 ${MY_PROJECT_SOURCE}

--- a/test/batched_gemm/CMakeLists.txt
+++ b/test/batched_gemm/CMakeLists.txt
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
   add_test_executable(test_batched_gemm_fp16 batched_gemm_fp16.cpp)
   target_link_libraries(test_batched_gemm_fp16 PRIVATE utility)
   target_link_libraries(test_batched_gemm_fp16 PRIVATE device_batched_gemm_instance)
@@ -14,4 +17,6 @@ if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS M
   add_test_executable(test_batched_gemm_int8 batched_gemm_int8.cpp)
   target_link_libraries(test_batched_gemm_int8 PRIVATE utility)
   target_link_libraries(test_batched_gemm_int8 PRIVATE device_batched_gemm_instance)
-endif()
+   set(target 1)
\ No newline at end of file
+ endif()
+endforeach()
\ No newline at end of file
--- a/test/batched_gemm_gemm/CMakeLists.txt
+++ b/test/batched_gemm_gemm/CMakeLists.txt
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
   add_custom_target(test_batched_gemm_gemm)
   add_gtest_executable(test_batched_gemm_gemm_fp16 test_batched_gemm_gemm_fp16.cpp)
   target_link_libraries(test_batched_gemm_gemm_fp16 PRIVATE utility device_batched_gemm_gemm_instance)
   add_dependencies(test_batched_gemm_gemm test_batched_gemm_gemm_fp16)
-endif()
+   set(target 1)
\ No newline at end of file
+ endif()
+endforeach()
\ No newline at end of file
--- a/test/batched_gemm_reduce/CMakeLists.txt
+++ b/test/batched_gemm_reduce/CMakeLists.txt
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
   add_test_executable(test_batched_gemm_reduce_fp16 batched_gemm_reduce_fp16.cpp)
   target_link_libraries(test_batched_gemm_reduce_fp16 PRIVATE utility)
   target_link_libraries(test_batched_gemm_reduce_fp16 PRIVATE device_batched_gemm_reduce_instance)
-endif()
+   set(target 1)
+ endif()
+endforeach()
--- a/test/batched_gemm_softmax_gemm/CMakeLists.txt
+++ b/test/batched_gemm_softmax_gemm/CMakeLists.txt
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
   add_custom_target(test_batched_gemm_softmax_gemm)
   add_gtest_executable(test_batched_gemm_softmax_gemm_fp16 test_batched_gemm_softmax_gemm_fp16.cpp)
   target_link_libraries(test_batched_gemm_softmax_gemm_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_instance)
   add_dependencies(test_batched_gemm_softmax_gemm test_batched_gemm_softmax_gemm_fp16)
-endif()
+   set(target 1)
\ No newline at end of file
+ endif()
+endforeach()
\ No newline at end of file
--- a/test/batched_gemm_softmax_gemm_permute/CMakeLists.txt
+++ b/test/batched_gemm_softmax_gemm_permute/CMakeLists.txt
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
   add_custom_target(test_batched_gemm_softmax_gemm_permute)
   add_gtest_executable(test_batched_gemm_softmax_gemm_permute_fp16 test_batched_gemm_softmax_gemm_permute_fp16.cpp)
@@ -14,4 +17,6 @@ if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS M
   target_link_libraries(test_batched_gemm_bias_softmax_gemm_permute_bf16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
   add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_bias_softmax_gemm_permute_fp16)
   add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_bias_softmax_gemm_permute_bf16)
-endif()
+   set(target 1)
\ No newline at end of file
+ endif()
+endforeach()
\ No newline at end of file
--- a/test/contraction/CMakeLists.txt
+++ b/test/contraction/CMakeLists.txt
 add_gtest_executable(test_contraction test_contraction.cpp)
 target_link_libraries(test_contraction PRIVATE utility device_contraction_bilinear_instance device_contraction_scale_instance)
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
    add_gtest_executable(test_contraction_interface test_contraction_interface.cpp)
    target_link_libraries(test_contraction_interface PRIVATE utility device_contraction_bilinear_instance device_contraction_scale_instance)
-endif()
+   set(target 1)
+ endif()
+endforeach()
--- a/test/convnd_bwd_data/CMakeLists.txt
+++ b/test/convnd_bwd_data/CMakeLists.txt
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
   add_gtest_executable(test_convnd_bwd_data convnd_bwd_data.cpp)
   target_link_libraries(test_convnd_bwd_data PRIVATE utility device_conv1d_bwd_data_instance device_conv2d_bwd_data_instance device_conv3d_bwd_data_instance)
-endif()
+   set(target 1)
\ No newline at end of file
+ endif()
+endforeach()
\ No newline at end of file
--- a/test/convnd_fwd/CMakeLists.txt
+++ b/test/convnd_fwd/CMakeLists.txt
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
   add_gtest_executable(test_convnd_fwd convnd_fwd.cpp)
   target_link_libraries(test_convnd_fwd PRIVATE utility device_conv2d_fwd_instance)
-endif()
+   set(target 1)
+ endif()
+endforeach()
--- a/test/gemm_layernorm/CMakeLists.txt
+++ b/test/gemm_layernorm/CMakeLists.txt
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
   add_custom_target(test_gemm_layernorm)
   add_gtest_executable(test_gemm_add_relu_add_layernorm_fp16 test_gemm_add_relu_add_layernorm_fp16.cpp)
   target_link_libraries(test_gemm_add_relu_add_layernorm_fp16 PRIVATE utility device_gemm_add_relu_add_layernorm_instance)
   add_dependencies(test_gemm_layernorm test_gemm_add_relu_add_layernorm_fp16)
-endif()
+   set(target 1)
+ endif()
+endforeach()
--- a/test/gemm_split_k/CMakeLists.txt
+++ b/test/gemm_split_k/CMakeLists.txt
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
   add_gtest_executable(test_gemm_splitk test_gemm_splitk.cpp)
   target_link_libraries(test_gemm_splitk PRIVATE utility device_gemm_splitk_instance)
-endif()
+   set(target 1)
+ endif()
+endforeach()
--- a/test/grouped_convnd_bwd_weight/CMakeLists.txt
+++ b/test/grouped_convnd_bwd_weight/CMakeLists.txt
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
   add_gtest_executable(test_grouped_convnd_bwd_weight grouped_convnd_bwd_weight.cpp)
-    target_link_libraries(test_grouped_convnd_bwd_weight PRIVATE utility device_grouped_conv1d_bwd_weight_instance device_grouped_conv2d_bwd_weight_instance device_grouped_conv3d_bwd_weight_instance)
+   target_link_libraries(test_grouped_convnd_bwd_weight PRIVATE utility device_grouped_conv1d_bwd_weight_instance device_grouped_conv2d_bwd_weight_instance device_grouped_conv3d_bwd_weight_instance)
-endif()
+   set(target 1)
\ No newline at end of file
+ endif()
+endforeach()
\ No newline at end of file
--- a/test/grouped_gemm/CMakeLists.txt
+++ b/test/grouped_gemm/CMakeLists.txt
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
   add_custom_target(test_grouped_gemm)
   add_gtest_executable(test_grouped_gemm_splitk test_grouped_gemm_splitk.cpp)
   add_gtest_executable(test_grouped_gemm_interface test_grouped_gemm_interface.cpp)
@@ -6,4 +9,6 @@ if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS M
   target_link_libraries(test_grouped_gemm_interface PRIVATE utility device_grouped_gemm_instance)
   add_dependencies(test_grouped_gemm test_grouped_gemm_splitk test_grouped_gemm_interface)
-endif()
+   set(target 1)
+ endif()
+endforeach()