"git@developer.sourcefind.cn:modelzoo/resnet50_tensorflow.git" did not exist on "e61588cda60f2cd1927c56548b5ef47c0519e147"
Commit 6f0ef227 authored by Chao Liu


Revise the buffer_load out-of-bounds (OOB) handling for fp32: the fallback path no longer modifies the load offset; instead it always issues the buffer_load with the original offset and modifies the loaded data, zeroing it for invalid threads.
parent 82bf5de2
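For context, the fp32 change in the #else (non-experimental) branches below replaces a branch around the load with an unconditional load followed by a select on the loaded value, so the offset passed to buffer_load is left untouched. A minimal host-side sketch of the two shapes, with a hypothetical raw_load() standing in for __llvm_amdgcn_buffer_load_f32 (the real intrinsic operates on a 128-bit buffer resource descriptor, not a pointer):

#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for the buffer_load intrinsic used in this file.
static float raw_load(const float* base, uint32_t byte_offset)
{
    return *reinterpret_cast<const float*>(
        reinterpret_cast<const unsigned char*>(base) + byte_offset);
}

// Old fallback: the branch decides whether the load is issued at all.
static float load_old(const float* base, uint32_t byte_offset, bool valid)
{
    return valid ? raw_load(base, byte_offset) : 0.0f;
}

// New fallback (this commit): always issue the load with the unmodified
// offset, then replace the loaded data with 0 when the thread's data is invalid.
static float load_new(const float* base, uint32_t byte_offset, bool valid)
{
    float tmp = raw_load(base, byte_offset);
    return valid ? tmp : 0.0f;
}

int main()
{
    float buf[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    std::printf("%f %f\n", load_old(buf, 4, true), load_new(buf, 4, false));
    return 0;
}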
@@ -146,16 +146,16 @@ __device__ float amd_buffer_load<float, 1>(const float* p_src_wave,
     index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);

-#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
+#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
     uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;

     return __llvm_amdgcn_buffer_load_f32(
         src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
 #else
-    return src_thread_data_valid
-               ? __llvm_amdgcn_buffer_load_f32(
-                     src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false)
-               : 0;
+    float tmp = __llvm_amdgcn_buffer_load_f32(
+        src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);
+
+    return src_thread_data_valid ? tmp : float(0);
 #endif
 }
@@ -176,18 +176,16 @@ __device__ float2_t amd_buffer_load<float, 2>(const float* p_src_wave,
     index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);

-#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
+#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
     uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;

     return __llvm_amdgcn_buffer_load_f32x2(
         src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
 #else
-    float2_t zeros(0);
-
-    return src_thread_data_valid
-               ? __llvm_amdgcn_buffer_load_f32x2(
-                     src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false)
-               : zeros;
+    float2_t tmp = __llvm_amdgcn_buffer_load_f32x2(
+        src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);
+
+    return src_thread_data_valid ? tmp : float2_t(0);
 #endif
 }
@@ -208,18 +206,16 @@ __device__ float4_t amd_buffer_load<float, 4>(const float* p_src_wave,
     index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);

-#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
+#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
     uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;

     return __llvm_amdgcn_buffer_load_f32x4(
         src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
 #else
-    float4_t zeros(0);
-
-    return src_thread_data_valid
-               ? __llvm_amdgcn_buffer_load_f32x4(
-                     src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false)
-               : zeros;
+    float4_t tmp = __llvm_amdgcn_buffer_load_f32x4(
+        src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);
+
+    return src_thread_data_valid ? tmp : float4_t(0);
 #endif
 }
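The CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK branches above keep the opposite strategy: they do modify the offset, adding 0x7fffffff for invalid threads so the address falls outside the range described by the buffer resource, and the hardware range check then yields zero for the load. A small sketch of just that offset computation (the descriptor setup and intrinsic call are as in the diff; the helper name here is made up):

#include <cstdint>

// Offset-shift OOB guard used by the experimental path: an invalid thread's
// byte offset is pushed past any plausible buffer size, so the buffer
// resource's range check makes the subsequent load return 0.
inline uint32_t oob_guarded_offset(uint32_t byte_offset, bool valid)
{
    uint32_t addr_shift = valid ? 0u : 0x7fffffffu;
    return addr_shift + byte_offset; // passed as the load's voffset
}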
@@ -240,7 +236,7 @@ __device__ half_t amd_buffer_load<half_t, 1>(const half_t* p_src_wave,
     index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);

-#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
+#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
     uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;

     // current code cannot isolate Soffset and Voffset, so Soffset is hard-coded to 0, and
@@ -255,7 +251,7 @@ __device__ half_t amd_buffer_load<half_t, 1>(const half_t* p_src_wave,
     return src_thread_data_valid ? __llvm_amdgcn_raw_buffer_load_f16(
                                        src_wave_buffer_resource.data, src_thread_addr_offset, 0, 0)
                                  : zero;
-#endif // CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
+#endif
 }

 template <>
@@ -275,7 +271,7 @@ __device__ half2_t amd_buffer_load<half_t, 2>(const half_t* p_src_wave,
     index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);

-#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
+#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
     uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;

     float dst_out_tmp = __llvm_amdgcn_buffer_load_f32(
@@ -309,7 +305,7 @@ __device__ half4_t amd_buffer_load<half_t, 4>(const half_t* p_src_wave,
     index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);

-#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
+#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
     uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;

     float2_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x2(
@@ -343,7 +339,7 @@ __device__ half8_t amd_buffer_load<half_t, 8>(const half_t* p_src_wave,
     index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);

-#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
+#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
     uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;

     float4_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x4(
@@ -377,7 +373,7 @@ __device__ ushort amd_buffer_load<ushort, 1>(const ushort* p_src_wave,
     index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);

-#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
+#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
     uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;

     // current code cannot isolate Soffset and Voffset, so Soffset is hard-coded to 0, and
@@ -412,7 +408,7 @@ __device__ ushort2_t amd_buffer_load<ushort, 2>(const ushort* p_src_wave,
     index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);

-#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
+#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
     uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;

     float dst_out_tmp = __llvm_amdgcn_buffer_load_f32(
@@ -446,7 +442,7 @@ __device__ ushort4_t amd_buffer_load<ushort, 4>(const ushort* p_src_wave,
     index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);

-#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
+#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
     uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;

     float2_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x2(
@@ -480,7 +476,7 @@ __device__ ushort8_t amd_buffer_load<ushort, 8>(const ushort* p_src_wave,
     index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);

-#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
+#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
     uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;

     float4_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x4(
@@ -515,7 +511,7 @@ __device__ void amd_buffer_store<float, 1>(const float* p_src_thread,
     index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);

-#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
+#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
     uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

     __llvm_amdgcn_buffer_store_f32(*p_src_thread,
@@ -551,7 +547,7 @@ __device__ void amd_buffer_store<float, 2>(const float* p_src_thread,
     index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);

-#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
+#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
     uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

     __llvm_amdgcn_buffer_store_f32x2(*reinterpret_cast<const float2_t*>(p_src_thread),
@@ -591,7 +587,7 @@ __device__ void amd_buffer_store<float, 4>(const float* p_src_thread,
     index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);

-#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
+#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
     uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

     __llvm_amdgcn_buffer_store_f32x4(*reinterpret_cast<const float4_t*>(p_src_thread),
@@ -631,7 +627,7 @@ __device__ void amd_buffer_store<half_t, 1>(const half_t* p_src_thread,
     index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t);

-#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
+#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
     uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

     // current code cannot isolate Soffset and Voffset, so Soffset is hard-coded to 0, and
@@ -672,7 +668,7 @@ __device__ void amd_buffer_store<half_t, 2>(const half_t* p_src_thread,
     const float* p_src_tmp = reinterpret_cast<const float*>(p_src_thread);

-#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
+#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
     uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

     __llvm_amdgcn_buffer_store_f32(*p_src_tmp,
@@ -710,7 +706,7 @@ __device__ void amd_buffer_store<half_t, 4>(const half_t* p_src_thread,
     const float2_t* p_src_tmp = reinterpret_cast<const float2_t*>(p_src_thread);

-#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
+#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
     uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

     __llvm_amdgcn_buffer_store_f32x2(*p_src_tmp,
@@ -748,7 +744,7 @@ __device__ void amd_buffer_store<half_t, 8>(const half_t* p_src_thread,
     const float4_t* p_src_tmp = reinterpret_cast<const float4_t*>(p_src_thread);

-#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
+#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
     uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

     __llvm_amdgcn_buffer_store_f32x4(*p_src_tmp,
@@ -784,7 +780,7 @@ __device__ void amd_buffer_store<ushort, 1>(const ushort* p_src_thread,
     index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(ushort);

-#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
+#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
     uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

     __llvm_amdgcn_raw_buffer_store_bf16(*p_src_thread,
@@ -821,7 +817,7 @@ __device__ void amd_buffer_store<ushort, 2>(const ushort* p_src_thread,
     const float* p_src_tmp = reinterpret_cast<const float*>(p_src_thread);

-#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
+#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
     uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

     __llvm_amdgcn_buffer_store_f32(*p_src_tmp,
@@ -859,7 +855,7 @@ __device__ void amd_buffer_store<ushort, 4>(const ushort* p_src_thread,
     const float2_t* p_src_tmp = reinterpret_cast<const float2_t*>(p_src_thread);

-#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
+#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
     uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

     __llvm_amdgcn_buffer_store_f32x2(*p_src_tmp,
@@ -897,7 +893,7 @@ __device__ void amd_buffer_store<ushort, 8>(const ushort* p_src_thread,
     const float4_t* p_src_tmp = reinterpret_cast<const float4_t*>(p_src_thread);

-#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
+#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
     uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

     __llvm_amdgcn_buffer_store_f32x4(*p_src_tmp,
@@ -934,7 +930,7 @@ __device__ void amd_buffer_atomic_add<float, 1>(const float* p_src_thread,
     index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);

-#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
+#if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK
     uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

     __llvm_amdgcn_buffer_atomic_add_f32(*p_src_thread,
@@ -969,7 +965,7 @@ __device__ void amd_buffer_atomic_add<float, 2>(const float* p_src_thread,
     index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);

-#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
+#if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK
     uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

     for(index_t i = 0; i < 2; ++i)
@@ -1014,7 +1010,7 @@ __device__ void amd_buffer_atomic_add<float, 4>(const float* p_src_thread,
     index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);

-#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
+#if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK
     uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;

     for(index_t i = 0; i < 4; ++i)
...