Add s_nops after v_dot to avoid hazard

df4cc03f · Bartlomiej · Bartlomiej Kocot · e7dca79d · df4cc03f · df4cc03f
Commit df4cc03f authored Jul 24, 2023 by Bartlomiej Committed by Bartlomiej Kocot Jul 27, 2023
3 changed files
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp
@@ -70,10 +70,9 @@ __global__ void
            const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
            const Block2CTileMap block_2_ctile_map)
 {
-// TODO: Enable for gfx90a after complier fix
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) ||             \
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) ||              \
+    defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx1030__) || defined(__gfx1100__) || \
-    defined(__gfx940__) || defined(__gfx1030__) || defined(__gfx1100__) || defined(__gfx1101__) || \
+    defined(__gfx1101__) || defined(__gfx1102__))
-    defined(__gfx1102__))
    const index_t num_blocks_per_batch =
        __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
@@ -650,10 +649,10 @@ struct DeviceBatchedGemmMultipleD_Dl : public DeviceBatchedGemmMultiD<ALayout,
    static bool IsSupportedArgument(const Argument& arg)
    {
        // TODO: Enable for gfx90a after complier fix
-        if(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx908" ||
+        if(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx90a" ||
-           ck::get_device_name() == "gfx1030" || ck::get_device_name() == "gfx940" ||
+           ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx1030" ||
-           ck::get_device_name() == "gfx1100" || ck::get_device_name() == "gfx1101" ||
+           ck::get_device_name() == "gfx940" || ck::get_device_name() == "gfx1100" ||
-           ck::get_device_name() == "gfx1102")
+           ck::get_device_name() == "gfx1101" || ck::get_device_name() == "gfx1102")
        {
            bool pass = true;
            pass      = pass && arg.K_ % K1 == 0;

--- a/include/ck/utility/inner_product.hpp
+++ b/include/ck/utility/inner_product.hpp
@@ -75,16 +75,17 @@ inner_product<float4_t, float4_t, float>(const float4_t& a, const float4_t& b, f
 template <>
 __device__ void inner_product<half2_t, half2_t, float>(const half2_t& a, const half2_t& b, float& c)
 {
-#if defined(CK_USE_AMD_V_DOT2_F32_F16)
+// builtin is disabled because it does not generate s_nop
-#if CK_USE_AMD_INNER_PRODUCT_INLINE_ASM
+// and this can lead to hazards
+#if defined(CK_USE_AMD_V_DOT2_F32_F16) && CK_USE_AMD_INNER_PRODUCT_INLINE_ASM
+    // Use 3 x s_nop to avoid hazard (mi200 cdna2 isa)
    asm volatile("\n \
            v_dot2_f32_f16 %0, %1, %2, %0\n \
+            s_nop 2 \n \
            "
                 : "=v"(c)
                 : "v"(a), "v"(b), "0"(c));
-#else
    c = __builtin_amdgcn_sdot2(a, b, c, false);
-#endif
 #else
    const vector_type<half_t, 2> a_vector{a};
    const vector_type<half_t, 2> b_vector{b};
@@ -162,16 +163,16 @@ template <>
 __device__ void
 inner_product<int8x4_t, int8x4_t, int32_t>(const int8x4_t& a, const int8x4_t& b, int32_t& c)
 {
-#if defined(CK_USE_AMD_V_DOT4_I32_I8)
+// builtin is disabled because it does not generate s_nop
-#if CK_USE_AMD_INNER_PRODUCT_INLINE_ASM
+// and this can lead to hazards
+#if defined(CK_USE_AMD_V_DOT4_I32_I8) && CK_USE_AMD_INNER_PRODUCT_INLINE_ASM
+    // Use 3 x s_nop to avoid hazard (mi200 cdna2 isa)
    asm volatile("\n \
            v_dot4_i32_i8 %0, %1, %2, %0\n \
+            s_nop 2 \n \
            "
                 : "=v"(c)
                 : "v"(bit_cast<int32_t>(a)), "v"(bit_cast<int32_t>(b)), "0"(c));
-#else
-    c = __builtin_amdgcn_sdot4(bit_cast<int32_t>(a), bit_cast<int32_t>(b), c, false);
-#endif
 #else
    const vector_type<int8_t, 4> a_vector{a};
    const vector_type<int8_t, 4> b_vector{b};

--- a/test/batched_gemm_multi_d/CMakeLists.txt
+++ b/test/batched_gemm_multi_d/CMakeLists.txt
 # TODO: Enable for gfx90a after complier fix
 if(DL_KERNELS)
-  if(NOT GPU_TARGETS MATCHES "gfx90a")
+  add_gtest_executable(test_batched_gemm_multi_d test_batched_gemm_multi_d.cpp)
-    add_gtest_executable(test_batched_gemm_multi_d test_batched_gemm_multi_d.cpp)
+  target_link_libraries(test_batched_gemm_multi_d PRIVATE utility device_batched_gemm_multi_d_instance)
-    target_link_libraries(test_batched_gemm_multi_d PRIVATE utility device_batched_gemm_multi_d_instance)
-  endif()
 endif()