Commit 3a44a469 authored by Chao Liu
Browse files

Fix bug for MIOpen

parent 9d000309
......@@ -246,14 +246,14 @@ __device__ half_t amd_buffer_load<half_t, 1>(const half_t* p_src_wave,
// current code cannot isolate Soffset and Voffset, so Soffset is hard-coded to 0, and
// everything is passed to Voffset
return __llvm_amdgcn_raw_buffer_load_f16(
src_wave_buffer_resource.data, src_addr_shift + src_thread_data_offset, 0, 0);
src_wave_buffer_resource.data, src_addr_shift + src_thread_addr_offset, 0, 0);
#else
half_t zero(0);
// current code cannot isolate Soffset and Voffset, so Soffset is hard-coded to 0, and
// everything is passed to Voffset
return src_thread_data_valid ? __llvm_amdgcn_raw_buffer_load_f16(
src_wave_buffer_resource.data, src_thread_data_offset, 0, 0)
src_wave_buffer_resource.data, src_thread_addr_offset, 0, 0)
: zero;
#endif // CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
}
......@@ -356,7 +356,7 @@ __device__ half8_t amd_buffer_load<half_t, 8>(const half_t* p_src_wave,
float4_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x4(
src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);
return src_thread_data_offset ? *reinterpret_cast<half8_t*>(&dst_out_tmp) : zeros;
return src_thread_data_valid ? *reinterpret_cast<half8_t*>(&dst_out_tmp) : zeros;
#endif
}
......@@ -385,7 +385,7 @@ __device__ ushort amd_buffer_load<ushort, 1>(const ushort* p_src_wave,
return __llvm_amdgcn_raw_buffer_load_bf16(
src_wave_buffer_resource.data, src_addr_shift + src_thread_addr_offset, 0, 0);
#else
ushort_t zero(0);
ushort zero(0);
// current code cannot isolate Soffset and Voffset, so Soffset is hard-coded to 0, and
// everything is passed to Voffset
......@@ -493,7 +493,7 @@ __device__ ushort8_t amd_buffer_load<ushort, 8>(const ushort* p_src_wave,
float4_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x4(
src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);
return src_thread_data_offset ? *reinterpret_cast<ushort8_t*>(&dst_out_tmp) : zeros;
return src_thread_data_valid ? *reinterpret_cast<ushort8_t*>(&dst_out_tmp) : zeros;
#endif
}
......@@ -969,9 +969,9 @@ __device__ void amd_buffer_atomic_add<float, 2>(const float* p_src_thread,
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
for(index_t i = 0; i < 2; ++i)
{
__llvm_amdgcn_buffer_atomic_add_f32(p_src_thread[i],
......
#ifndef CK_COMMON_HEADER_HPP
#define CK_COMMON_HEADER_HPP
#include "config.hpp"
#include "array.hpp"
#include "container_helper.hpp"
#include "statically_indexed_array.hpp"
#include "container_element_picker.hpp"
#include "config.hpp"
#include "float_type.hpp"
#include "functional.hpp"
#include "functional2.hpp"
......@@ -25,7 +25,6 @@
#if CK_USE_AMD_INLINE_ASM
#include "amd_inline_asm.hpp"
#include "amd_llvm_intrinsic.hpp"
#endif
#if CK_USE_AMD_XDLOPS
......
#ifndef CK_CONFIG_AMD_HPP
#define CK_CONFIG_AMD_HPP
#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS
#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
#endif
#include "bfloat16_dev.hpp"
#ifndef CK_HIP_VERSION_FLAT
......@@ -78,6 +80,12 @@
#define CK_WORKAROUND_SWDEV_229564 1
#endif
// workaround for accvgpr over-allocation
#ifndef CK_WORKAROUND_SWDEV_241664
#define CK_WORKAROUND_SWDEV_241664 1
#endif
namespace ck {
enum AddressSpace
......
......@@ -21,6 +21,153 @@ typedef ushort ushort2_t __attribute__((ext_vector_type(2)));
typedef ushort ushort4_t __attribute__((ext_vector_type(4)));
typedef ushort ushort8_t __attribute__((ext_vector_type(8)));
// Container type whose nested union groups four float32_t members and
// exposes the same storage as a float array.
struct c_vec32_4_t
{
    // s : the four named float32_t members (x, y, z, w).
    // n : float[128] member sharing storage with s.
    union VecType
    {
        struct
        {
            float32_t x, y, z, w;
        } s;
        float n[128];
    };

    // Builds a VecType and assigns 0 to each of s.x, s.y, s.z and s.w
    // before returning it by value.
    __host__ __device__ static VecType CreateVecZero()
    {
        VecType zeroed;

        zeroed.s.x = 0;
        zeroed.s.y = 0;
        zeroed.s.z = 0;
        zeroed.s.w = 0;

        return zeroed;
    }
};
// Container type whose nested union groups two float32_t members and
// exposes the same storage as a float array. Unlike its sibling structs,
// it also holds an instance of the union as data member `l`.
struct c_vec32_2_t
{
    // s : the two named float32_t members (x, y).
    // n : float[64] member sharing storage with s.
    union VecType
    {
        struct
        {
            float32_t x, y;
        } s;
        float n[64];
    } l; // instance member; c_vec32_2_2_t reaches its fields through `.l.s`

    // Builds a VecType and assigns 0 to s.x and s.y before returning it by value.
    __host__ __device__ static VecType CreateVecZero()
    {
        VecType zeroed;

        zeroed.s.x = 0;
        zeroed.s.y = 0;

        return zeroed;
    }
};
// Container type whose nested union groups two c_vec32_2_t objects and
// exposes the same storage as a float array.
struct c_vec32_2_2_t
{
    // s : two c_vec32_2_t members (x, y), each carrying its own union in `.l`.
    // n : float[128] member sharing storage with s.
    union VecType
    {
        struct
        {
            c_vec32_2_t x, y;
        } s;
        float n[128];
    };

    // Builds a VecType and assigns 0 to both float32_t members of each
    // nested c_vec32_2_t before returning it by value.
    __host__ __device__ static VecType CreateVecZero()
    {
        VecType zeroed;

        // first nested c_vec32_2_t
        zeroed.s.x.l.s.x = 0;
        zeroed.s.x.l.s.y = 0;

        // second nested c_vec32_2_t
        zeroed.s.y.l.s.x = 0;
        zeroed.s.y.l.s.y = 0;

        return zeroed;
    }
};
// Container type whose nested union wraps a single float32_t member and
// exposes the same storage as a float array.
struct c_vec32_1_t
{
    // s : the single named float32_t member (x).
    // n : float[32] member sharing storage with s.
    union VecType
    {
        struct
        {
            float32_t x;
        } s;
        float n[32];
    };

    // Builds a VecType and assigns 0 to s.x before returning it by value.
    __host__ __device__ static VecType CreateVecZero()
    {
        VecType zeroed;

        zeroed.s.x = 0;

        return zeroed;
    }
};
// Container type whose nested union wraps a single float16_t member and
// exposes the same storage as a float array.
struct c_vec16_1_t
{
    // s : the single named float16_t member (x).
    // n : float[16] member sharing storage with s.
    union VecType
    {
        struct
        {
            float16_t x;
        } s;
        float n[16];
    };

    // Builds a VecType and assigns 0 to s.x before returning it by value.
    __host__ __device__ static VecType CreateVecZero()
    {
        VecType zeroed;

        zeroed.s.x = 0;

        return zeroed;
    }
};
// Container type whose nested union groups two float4_t members and
// exposes the same storage as a float array.
struct c_vec4_2_t
{
    // s : the two named float4_t members (x, y).
    // n : float[8] member sharing storage with s.
    union VecType
    {
        struct
        {
            float4_t x, y;
        } s;
        float n[8];
    };

    // Builds a VecType and assigns 0 to s.x and s.y before returning it by value.
    __host__ __device__ static VecType CreateVecZero()
    {
        VecType zeroed;

        zeroed.s.x = 0;
        zeroed.s.y = 0;

        return zeroed;
    }
};
// Container type whose nested union wraps a single float4_t member and
// exposes the same storage as a float array.
struct c_vec4_1_t
{
    // s : the single named float4_t member (x).
    // n : float[4] member sharing storage with s.
    union VecType
    {
        struct
        {
            float4_t x;
        } s;
        float n[4];
    };

    // Builds a VecType and assigns 0 to s.x before returning it by value.
    __host__ __device__ static VecType CreateVecZero()
    {
        VecType zeroed;

        zeroed.s.x = 0;

        return zeroed;
    }
};
template <class T, index_t N>
struct vector_type
{
......
......@@ -141,7 +141,7 @@ struct AtomicAddData
}
}
#if CK_USE_AMD_BUFFER_ADDRESSING && CK_USE_AMD_BUFFER_ATOMIC_ADD
#if CK_USE_AMD_BUFFER_ADDRESSING && CK_USE_AMD_BUFFER_ATOMIC_FADD
// buffer_atomic requires:
// 1) p_src_thread must be in vgpr space, p_dst_thread must be global memory
// 2) p_dst_thread to be a wavewise pointer.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment