fix bug for miopen

3a44a469 · Chao Liu · 9d000309 · 3a44a469 · 3a44a469 · 3a44a469
Commit 3a44a469 authored Oct 15, 2020 by Chao Liu
5 changed files
--- a/composable_kernel/include/utility/amd_buffer_addressing.hpp
+++ b/composable_kernel/include/utility/amd_buffer_addressing.hpp
@@ -246,14 +246,14 @@ __device__ half_t amd_buffer_load<half_t, 1>(const half_t* p_src_wave,
    // current code cannot isolate Soffset and Voffset, so Soffset is hard-coded to 0, and
    // everything is passed to Voffset
    return __llvm_amdgcn_raw_buffer_load_f16(
-        src_wave_buffer_resource.data, src_addr_shift + src_thread_data_offset, 0, 0);
+        src_wave_buffer_resource.data, src_addr_shift + src_thread_addr_offset, 0, 0);
 #else
    half_t zero(0);

    // current code cannot isolate Soffset and Voffset, so Soffset is hard-coded to 0, and
    // everything is passed to Voffset
    return src_thread_data_valid ? __llvm_amdgcn_raw_buffer_load_f16(
-                                       src_wave_buffer_resource.data, src_thread_data_offset, 0, 0)
+                                       src_wave_buffer_resource.data, src_thread_addr_offset, 0, 0)
                                 : zero;
 #endif // CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
 }
@@ -356,7 +356,7 @@ __device__ half8_t amd_buffer_load<half_t, 8>(const half_t* p_src_wave,
    float4_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x4(
        src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);

-    return src_thread_data_offset ? *reinterpret_cast<half8_t*>(&dst_out_tmp) : zeros;
+    return src_thread_data_valid ? *reinterpret_cast<half8_t*>(&dst_out_tmp) : zeros;
 #endif
 }

@@ -385,7 +385,7 @@ __device__ ushort amd_buffer_load<ushort, 1>(const ushort* p_src_wave,
    return __llvm_amdgcn_raw_buffer_load_bf16(
        src_wave_buffer_resource.data, src_addr_shift + src_thread_addr_offset, 0, 0);
 #else
-    ushort_t zero(0);
+    ushort zero(0);

    // current code cannot isolate Soffset and Voffset, so Soffset is hard-coded to 0, and
    // everything is passed to Voffset
@@ -493,7 +493,7 @@ __device__ ushort8_t amd_buffer_load<ushort, 8>(const ushort* p_src_wave,
    float4_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x4(
        src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);

-    return src_thread_data_offset ? *reinterpret_cast<ushort8_t*>(&dst_out_tmp) : zeros;
+    return src_thread_data_valid ? *reinterpret_cast<ushort8_t*>(&dst_out_tmp) : zeros;
 #endif
 }

@@ -969,9 +969,9 @@ __device__ void amd_buffer_atomic_add<float, 2>(const float* p_src_thread,

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);

+#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

-#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
    for(index_t i = 0; i < 2; ++i)
    {
        __llvm_amdgcn_buffer_atomic_add_f32(p_src_thread[i],

--- a/composable_kernel/include/utility/common_header.hpp
+++ b/composable_kernel/include/utility/common_header.hpp
 #ifndef CK_COMMON_HEADER_HPP
 #define CK_COMMON_HEADER_HPP

+#include "config.hpp"
 #include "array.hpp"
 #include "container_helper.hpp"
 #include "statically_indexed_array.hpp"
 #include "container_element_picker.hpp"
-#include "config.hpp"
 #include "float_type.hpp"
 #include "functional.hpp"
 #include "functional2.hpp"
@@ -25,7 +25,6 @@

 #if CK_USE_AMD_INLINE_ASM
 #include "amd_inline_asm.hpp"
-#include "amd_llvm_intrinsic.hpp"
 #endif

 #if CK_USE_AMD_XDLOPS

--- a/composable_kernel/include/utility/config.amd.hpp.in
+++ b/composable_kernel/include/utility/config.amd.hpp.in
 #ifndef CK_CONFIG_AMD_HPP
 #define CK_CONFIG_AMD_HPP

+#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS
 #include "hip/hip_runtime.h"
 #include "hip/hip_fp16.h"
+#endif
 #include "bfloat16_dev.hpp"

 #ifndef CK_HIP_VERSION_FLAT
@@ -78,6 +80,12 @@
 #define CK_WORKAROUND_SWDEV_229564 1
 #endif

+// workaround for accvgpr over-allocation
+#ifndef CK_WORKAROUND_SWDEV_241664
+#define CK_WORKAROUND_SWDEV_241664 1
+#endif
+
+
 namespace ck {

 enum AddressSpace

--- a/composable_kernel/include/utility/float_type.amd.hpp.in
+++ b/composable_kernel/include/utility/float_type.amd.hpp.in
@@ -21,6 +21,153 @@ typedef ushort ushort2_t __attribute__((ext_vector_type(2)));
 typedef ushort ushort4_t __attribute__((ext_vector_type(4)));
 typedef ushort ushort8_t __attribute__((ext_vector_type(8)));

+// Storage for four float32_t values that can also be addressed as a flat float array.
+struct c_vec32_4_t
+{
+    // `s` names the four members x/y/z/w; `n` overlays the same storage as 128 scalar floats.
+    // NOTE(review): assumes float32_t is a 32-wide float vector so both views have equal size —
+    // confirm against the typedef.
+    union VecType
+    {
+        struct
+        {
+            float32_t x;
+            float32_t y;
+            float32_t z;
+            float32_t w;
+        } s;
+        float n[128];
+    };
+
+    // Builds a VecType whose x, y, z and w members have each been assigned zero.
+    __host__ __device__ static VecType CreateVecZero()
+    {
+        VecType zeroed;
+        zeroed.s.x = 0.0f;
+        zeroed.s.y = 0.0f;
+        zeroed.s.z = 0.0f;
+        zeroed.s.w = 0.0f;
+        return zeroed;
+    }
+};
+
+// Storage for two float32_t values that can also be addressed as a flat float array.
+// Unlike its sibling structs, this one also holds a VecType instance as data member `l`
+// (c_vec32_2_2_t reaches through it as `.l.s.x` / `.l.s.y`).
+struct c_vec32_2_t
+{
+    // `s` names the two members x/y; `n` overlays the same storage as 64 scalar floats.
+    // NOTE(review): assumes float32_t is a 32-wide float vector so both views have equal size —
+    // confirm against the typedef.
+    union VecType
+    {
+        struct
+        {
+            float32_t x;
+            float32_t y;
+        } s;
+        float n[64];
+    } l;
+
+    // Builds a VecType whose x and y members have each been assigned zero.
+    __host__ __device__ static VecType CreateVecZero()
+    {
+        VecType zeroed;
+        zeroed.s.x = 0.0f;
+        zeroed.s.y = 0.0f;
+        return zeroed;
+    }
+};
+
+// Storage for two c_vec32_2_t objects that can also be addressed as a flat float array.
+struct c_vec32_2_2_t
+{
+    // `s` names the two c_vec32_2_t members x/y; `n` overlays the same storage as 128 floats.
+    union VecType
+    {
+        struct
+        {
+            c_vec32_2_t x;
+            c_vec32_2_t y;
+        } s;
+        float n[128];
+    };
+
+    // Builds a VecType in which both float32_t members of each nested c_vec32_2_t
+    // (reached through its `l` member) have been assigned zero.
+    __host__ __device__ static VecType CreateVecZero()
+    {
+        VecType zeroed;
+        // first nested pair
+        zeroed.s.x.l.s.x = 0.0f;
+        zeroed.s.x.l.s.y = 0.0f;
+        // second nested pair
+        zeroed.s.y.l.s.x = 0.0f;
+        zeroed.s.y.l.s.y = 0.0f;
+        return zeroed;
+    }
+};
+
+// Storage for a single float32_t value that can also be addressed as a flat float array.
+struct c_vec32_1_t
+{
+    // `s.x` is the single named member; `n` overlays the same storage as 32 scalar floats.
+    // NOTE(review): assumes float32_t is a 32-wide float vector so both views have equal size —
+    // confirm against the typedef.
+    union VecType
+    {
+        struct
+        {
+            float32_t x;
+        } s;
+        float n[32];
+    };
+
+    // Builds a VecType whose x member has been assigned zero.
+    __host__ __device__ static VecType CreateVecZero()
+    {
+        VecType zeroed;
+        zeroed.s.x = 0.0f;
+        return zeroed;
+    }
+};
+
+// Storage for a single float16_t value that can also be addressed as a flat float array.
+struct c_vec16_1_t
+{
+    // `s.x` is the single named member; `n` overlays the same storage as 16 scalar floats.
+    // NOTE(review): assumes float16_t is a 16-wide float vector (not a half-precision scalar)
+    // so both views have equal size — confirm against the typedef.
+    union VecType
+    {
+        struct
+        {
+            float16_t x;
+        } s;
+        float n[16];
+    };
+
+    // Builds a VecType whose x member has been assigned zero.
+    __host__ __device__ static VecType CreateVecZero()
+    {
+        VecType zeroed;
+        zeroed.s.x = 0.0f;
+        return zeroed;
+    }
+};
+
+// Storage for two float4_t values that can also be addressed as a flat float array.
+struct c_vec4_2_t
+{
+    // `s` names the two members x/y; `n` overlays the same storage as 8 scalar floats.
+    // NOTE(review): assumes float4_t is a 4-wide float vector so both views have equal size —
+    // confirm against the typedef.
+    union VecType
+    {
+        struct
+        {
+            float4_t x;
+            float4_t y;
+        } s;
+        float n[8];
+    };
+
+    // Builds a VecType whose x and y members have each been assigned zero.
+    __host__ __device__ static VecType CreateVecZero()
+    {
+        VecType zeroed;
+        zeroed.s.x = 0.0f;
+        zeroed.s.y = 0.0f;
+        return zeroed;
+    }
+};
+
+// Storage for a single float4_t value that can also be addressed as a flat float array.
+struct c_vec4_1_t
+{
+    // `s.x` is the single named member; `n` overlays the same storage as 4 scalar floats.
+    // NOTE(review): assumes float4_t is a 4-wide float vector so both views have equal size —
+    // confirm against the typedef.
+    union VecType
+    {
+        struct
+        {
+            float4_t x;
+        } s;
+        float n[4];
+    };
+
+    // Builds a VecType whose x member has been assigned zero.
+    __host__ __device__ static VecType CreateVecZero()
+    {
+        VecType zeroed;
+        zeroed.s.x = 0.0f;
+        return zeroed;
+    }
+};
+
 template <class T, index_t N>
 struct vector_type
 {

--- a/composable_kernel/include/utility/in_memory_operation.amd.hpp.in
+++ b/composable_kernel/include/utility/in_memory_operation.amd.hpp.in
@@ -141,7 +141,7 @@ struct AtomicAddData
        }
    }

-#if CK_USE_AMD_BUFFER_ADDRESSING && CK_USE_AMD_BUFFER_ATOMIC_ADD
+#if CK_USE_AMD_BUFFER_ADDRESSING && CK_USE_AMD_BUFFER_ATOMIC_FADD
    // buffer_atomic requires:
    //   1) p_src_thread must be in vgpr space, p_dst_thread must be global memory
    //   2) p_dst_thread to be a wavewise pointer.