Commit 8bee1e81 authored by illsilin

enable some instances on gfx12

parent 78f637e4
 list(APPEND gpu_list_xdl gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
-list(APPEND gpu_list_wmma gfx1100 gfx1101 gfx1102 gfx1200)
+list(APPEND gpu_list_wmma gfx1100 gfx1101 gfx1102 gfx1200 gfx1201)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
     if(gpu IN_LIST gpu_list_xdl AND target EQUAL 0)
......
 list(APPEND gpu_list1 gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
-list(APPEND gpu_list2 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200)
+list(APPEND gpu_list2 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
......
 list(APPEND gpu_list_xdl gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
-list(APPEND gpu_list_wmma gfx1100 gfx1101 gfx1102 gfx1200)
+list(APPEND gpu_list_wmma gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
     if(gpu IN_LIST gpu_list_xdl AND target EQUAL 0)
......
@@ -49,7 +49,8 @@ __global__ void
             const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx103__) || \
-    defined(__gfx90a__) || defined(__gfx908__) || defined(__gfx94__) || defined(__gfx11__))
+    defined(__gfx90a__) || defined(__gfx908__) || defined(__gfx94__) || defined(__gfx11__) || \
+    defined(__gfx12__))
     const index_t num_blocks_per_batch =
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
     const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
......
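For context, the guarded body above splits the grid evenly across batches and uses readfirstlane to keep the batch index uniform across the wavefront. The sketch below restates that arithmetic in a self-contained form; get_batch_index and the index_t alias are illustrative assumptions (not library code), and the library's get_grid_size()/get_block_1d_id() wrappers are replaced by the plain HIP built-ins so the sketch stands alone.

// Illustrative sketch only: the grid is divided evenly across batches, and
// __builtin_amdgcn_readfirstlane keeps both values uniform (scalar) across
// the wavefront, as in the kernel above.
#include <hip/hip_runtime.h>
#include <cstdint>

using index_t = int32_t; // assumed to match the library's index_t

__device__ index_t get_batch_index(index_t batch_count) // hypothetical helper
{
    const index_t num_blocks_per_batch =
        __builtin_amdgcn_readfirstlane(gridDim.x / batch_count);
    // Blocks [g * num_blocks_per_batch, (g + 1) * num_blocks_per_batch) all
    // work on batch g, so the batch index follows from the block id.
    return __builtin_amdgcn_readfirstlane(blockIdx.x / num_blocks_per_batch);
}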
@@ -60,8 +60,7 @@ __global__ void
             bool input_permute,
             bool output_permute)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__) || defined(__gfx11__) || \
-    defined(__gfx12__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__) || defined(__gfx12__))
     // clang-format off
     // ***************************************************
......
@@ -36,7 +36,8 @@ __global__ void
             const ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) || \
-    defined(__gfx90a__) || defined(__gfx94__) || defined(__gfx103__) || defined(__gfx11__))
+    defined(__gfx90a__) || defined(__gfx94__) || defined(__gfx103__) || defined(__gfx11__) || \
+    defined(__gfx12__))
     GridwiseTensorRearrangeKernel::Run(in_grid_desc,
                                        p_in_global,
                                        out_grid_desc,
......
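Taken together, the kernel-side hunks all follow one pattern: the kernel body is kept for the host pass and for the architectures listed in the guard, and adding defined(__gfx12__) extends that list to the gfx12xx targets (gfx1200/gfx1201) that the CMake hunks now build instances for. Below is a minimal, hedged sketch of that pattern; kernel_demo is a made-up name, and the umbrella macros are derived inline here as an assumption, since plain clang only predefines per-target macros such as __gfx1201__ while the library gets __gfx11__/__gfx12__ from its own config header.

#include <hip/hip_runtime.h>

// Derive umbrella macros from the per-target predefines, mirroring (as an
// assumption) what the library's config header does for __gfx11__/__gfx12__.
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__)
#define __gfx11__
#endif
#if defined(__gfx1200__) || defined(__gfx1201__)
#define __gfx12__
#endif

__global__ void kernel_demo(float* out) // hypothetical kernel, not library code
{
// Keep the body during the host pass and on supported device targets;
// defined(__gfx12__) is the addition that enables gfx12 builds.
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__) || defined(__gfx12__))
    out[blockIdx.x * blockDim.x + threadIdx.x] = 1.0f;
#else
    // On every other --offload-arch the kernel compiles to an empty stub,
    // so the fat binary still links.
    (void)out;
#endif
}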