Merge pull request #51 from ROCm/lwpck-1010

Additional Navi4x enablement

Merge pull request #51 from ROCm/lwpck-1010
Additional Navi4x enablement
7e147c64 · Illia Silin · GitHub · 9fa379ea · e7e224d3 · 7e147c64
Unverified Commit 7e147c64 authored Mar 12, 2024 by Illia Silin Committed by GitHub Mar 12, 2024
14 changed files
--- a/example/02_gemm_bilinear/CMakeLists.txt
+++ b/example/02_gemm_bilinear/CMakeLists.txt
@@ -6,7 +6,7 @@ foreach(gpu IN LISTS GPU_TARGETS)
    add_example_executable(example_gemm_bilinear_wmma_fp16 gemm_bilinear_wmma_fp16.cpp)
    add_example_executable(example_gemm_bilinear_wmma_int8 gemm_bilinear_wmma_int8.cpp)
 endif()
-if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx94")
   set(target 1)
 endif()
 endforeach()

--- a/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt
+++ b/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt
 list(APPEND gpu_list1 gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
-list(APPEND gpu_list2 gfx1100 gfx1101 gfx1102)
+list(APPEND gpu_list2 gfx1100 gfx1101 gfx1102 gfx1103)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)

--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -104,13 +104,6 @@
 #define CK_USE_AMD_MFMA_GFX940
 #endif
-// WMMA instruction
-#ifndef __HIP_DEVICE_COMPILE__ // for host code
-#define CK_USE_AMD_WMMA
-#elif defined(__gfx11__) || defined(__gfx12__) // for GPU code
-#define CK_USE_AMD_WMMA
-#endif
 // buffer load
 #define CK_USE_AMD_BUFFER_LOAD 1

--- a/include/ck/host_utility/device_prop.hpp
+++ b/include/ck/host_utility/device_prop.hpp
@@ -85,6 +85,9 @@ inline bool is_navi3_supported()
           ck::get_device_name() == "gfx1102" || ck::get_device_name() == "gfx1103";
 }
-inline bool is_navi4_supported() { return ck::get_device_name() == "gfx1200"; }
+inline bool is_navi4_supported()
+{
+    return ck::get_device_name() == "gfx1200" || ck::get_device_name() == "gfx1201";
+}
 } // namespace ck
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
@@ -488,7 +488,14 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
            // sync point.
            if constexpr(k.value != 0 || KPerInnerLoop == KPerThread)
            {
+#ifdef __gfx12__
+                asm volatile("\
+	        s_barrier_signal -1 \n \
+		s_barrier_wait -1 \
+		" ::);
+#else
                asm volatile("s_barrier" ::);
+#endif
                __builtin_amdgcn_sched_barrier(0);
            }
            static_for<0, KPerInnerLoop, KPack>{}([&](auto k_) {

--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp
@@ -70,8 +70,9 @@ __global__ void
            const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
            const Block2CTileMap block_2_ctile_map)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) || \
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) ||         \
-    defined(__gfx90a__) || defined(__gfx94__) || defined(__gfx103__) || defined(__gfx11__))
+    defined(__gfx90a__) || defined(__gfx94__) || defined(__gfx103__) || defined(__gfx11__) || \
+    defined(__gfx12__))
    const index_t num_blocks_per_batch =
        __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
@@ -648,7 +649,7 @@ struct DeviceBatchedGemmMultipleD_Dl : public DeviceBatchedGemmMultiD<ALayout,
    static bool IsSupportedArgument(const Argument& arg)
    {
        if(ck::get_device_name() == "gfx906" || ck::is_xdl_supported() ||
-           ck::is_navi2_supported() || ck::is_navi3_supported())
+           ck::is_navi2_supported() || ck::is_navi3_supported() || ck::is_navi4_supported())
        {
            bool pass = true;
            pass      = pass && arg.K_ % K1 == 0;

--- a/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp
@@ -1394,7 +1394,7 @@ struct DeviceConvNdBwdDataNwcKxcNwk_Dl
    {
        // check device
        if(!(ck::get_device_name() == "gfx906" || ck::is_navi2_supported() ||
-             ck::is_navi3_supported()))
+             ck::is_navi3_supported() || ck::is_navi4_supported()))
        {
            return false;
        }

--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp
@@ -50,8 +50,9 @@ __global__ void
            const CGridDesc_M0_M10_M11_N0_N10_N11 e_grid_desc_m0_m10_m11_n0_n10_n11,
            const Block2CTileMap block_2_ctile_map)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) || \
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) ||         \
-    defined(__gfx90a__) || defined(__gfx94__) || defined(__gfx103__) || defined(__gfx11__))
+    defined(__gfx90a__) || defined(__gfx94__) || defined(__gfx103__) || defined(__gfx11__) || \
+    defined(__gfx12__))
    constexpr index_t shared_block_size =
        GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(ABDataType);
@@ -552,7 +553,7 @@ struct DeviceGemmMultipleD_Dl : public DeviceGemmMultipleD<ALayout,
    static bool IsSupportedArgument(const Argument& arg)
    {
        if(ck::get_device_name() == "gfx906" || ck::is_xdl_supported() ||
-           ck::is_navi2_supported() || ck::is_navi3_supported())
+           ck::is_navi2_supported() || ck::is_navi3_supported() || ck::is_navi4_supported())
        {
            return GridwiseGemm::CheckValidity(
                arg.a_grid_desc_k0_m_k1_, arg.b_grid_desc_k0_n_k1_, arg.e_grid_desc_m_n_);

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
@@ -90,8 +90,9 @@ __global__ void
            const Block2CTileMap block_2_ctile_map,
            const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx103__) || \
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx103__) ||         \
-    defined(__gfx90a__) || defined(__gfx908__) || defined(__gfx94__) || defined(__gfx11__))
+    defined(__gfx90a__) || defined(__gfx908__) || defined(__gfx94__) || defined(__gfx11__) || \
+    defined(__gfx12__))
    // offset base pointer for each work-group
    const index_t num_blocks_per_batch =
        __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
@@ -666,7 +667,7 @@ struct DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
        // check device
        if(!(ck::get_device_name() == "gfx906" || ck::is_xdl_supported() ||
-             ck::is_navi2_supported() || ck::is_navi3_supported()))
+             ck::is_navi2_supported() || ck::is_navi3_supported() || ck::is_navi4_supported()))
        {
            return false;
        }

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp
@@ -107,7 +107,7 @@ __global__ void
            const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx103__) || \
-    defined(__gfx11__))
+    defined(__gfx11__) || defined(__gfx12__))
    // offset base pointer for each work-group
    const index_t num_blocks_per_batch =
        __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
@@ -602,7 +602,7 @@ struct DeviceGroupedConvFwdDl_NHWC_KYXC_NHWK : public DeviceGroupedConvFwd<NDimS
        // check device
        if(!(ck::get_device_name() == "gfx906" || ck::is_navi2_supported() ||
-             ck::is_navi3_supported()))
+             ck::is_navi3_supported() || ck::is_navi4_supported()))
        {
            return false;
        }

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp
@@ -39,8 +39,9 @@ __global__ void
                                          const BElementwiseOperation b_element_op,
                                          const CDEElementwiseOperation cde_element_op)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) || \
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) ||         \
-    defined(__gfx90a__) || defined(__gfx103__) || defined(__gfx11__) || defined(__gfx94__))
+    defined(__gfx90a__) || defined(__gfx103__) || defined(__gfx11__) || defined(__gfx94__) || \
+    defined(__gfx12__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
    const index_t block_id = get_block_1d_id();
@@ -668,7 +669,7 @@ struct DeviceGroupedGemmMultipleD_Dl : public DeviceGroupedGemm<ALayout,
        }
        if(ck::get_device_name() == "gfx906" || ck::is_xdl_supported() ||
-           ck::is_navi2_supported() || ck::is_navi3_supported())
+           ck::is_navi2_supported() || ck::is_navi3_supported() || ck::is_navi4_supported())
        {
            for(std::size_t i = 0; i < arg.gemm_desc_kernel_arg_.size(); i++)
            {

--- a/include/ck/utility/synchronization.hpp
+++ b/include/ck/utility/synchronization.hpp
@@ -12,9 +12,9 @@ __device__ void block_sync_lds()
 #if CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM
 #ifdef __gfx12__
    asm volatile("\
-	s_wait_dscnt 0x0 \n \
+    s_wait_dscnt 0x0 \n \
-	s_barrier_signal -1 \n \
+    s_barrier_signal -1 \n \
-	s_barrier_wait -1 \
+    s_barrier_wait -1 \
    " ::);
 #else
    asm volatile("\
@@ -29,11 +29,20 @@ __device__ void block_sync_lds()
 __device__ void block_sync_lds_direct_load()
 {
+#ifdef __gfx12__
+    asm volatile("\
+    s_wait_vmcnt 0x0 \n \
+    s_wait_dscnt 0x0 \n \
+    s_barrier_signal -1 \n \
+    s_barrier_wait -1 \
+    " ::);
+#else
    asm volatile("\
    s_waitcnt vmcnt(0) \n \
    s_waitcnt lgkmcnt(0) \n \
    s_barrier \
    " ::);
+#endif
 }
 __device__ void s_nop()

--- a/test/grouped_convnd_bwd_data/CMakeLists.txt
+++ b/test/grouped_convnd_bwd_data/CMakeLists.txt
@@ -16,4 +16,4 @@ foreach(gpu IN LISTS GPU_TARGETS)
        target_link_libraries(test_grouped_convnd_bwd_data_interface PRIVATE utility device_grouped_conv2d_bwd_data_instance)
        set(target 1)
    endif()
 endforeach()
\ No newline at end of file
--- a/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp
+++ b/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp
@@ -55,7 +55,7 @@ class TestGroupedConvndBwdWeight : public ::testing::Test
            }
        }
-        if(ck::is_navi3_supported())
+        if(ck::is_navi3_supported() || ck::is_navi4_supported())
        {
            // on navi3x only support for 3d is implemented
            if constexpr(NDimSpatial{} != 3)