Commit 6f0ef227 authored by Chao Liu's avatar Chao Liu
Browse files

revise buffer_load oob for fp32, it will not modify offset, but it will modify...

revise buffer_load oob for fp32, it will not modify offset, but it will modify data loaded by buffer_load
parent 82bf5de2
......@@ -146,16 +146,16 @@ __device__ float amd_buffer_load<float, 1>(const float* p_src_wave,
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
return __llvm_amdgcn_buffer_load_f32(
src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#else
return src_thread_data_valid
? __llvm_amdgcn_buffer_load_f32(
src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false)
: 0;
float tmp = __llvm_amdgcn_buffer_load_f32(
src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);
return src_thread_data_valid ? tmp : float(0);
#endif
}
......@@ -176,18 +176,16 @@ __device__ float2_t amd_buffer_load<float, 2>(const float* p_src_wave,
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
return __llvm_amdgcn_buffer_load_f32x2(
src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#else
float2_t zeros(0);
float2_t tmp = __llvm_amdgcn_buffer_load_f32x2(
src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);
return src_thread_data_valid
? __llvm_amdgcn_buffer_load_f32x2(
src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false)
: zeros;
return src_thread_data_valid ? tmp : float2_t(0);
#endif
}
......@@ -208,18 +206,16 @@ __device__ float4_t amd_buffer_load<float, 4>(const float* p_src_wave,
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
return __llvm_amdgcn_buffer_load_f32x4(
src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#else
float4_t zeros(0);
float4_t tmp = __llvm_amdgcn_buffer_load_f32x4(
src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);
return src_thread_data_valid
? __llvm_amdgcn_buffer_load_f32x4(
src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false)
: zeros;
return src_thread_data_valid ? tmp : float4_t(0);
#endif
}
......@@ -240,7 +236,7 @@ __device__ half_t amd_buffer_load<half_t, 1>(const half_t* p_src_wave,
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
// current code cannot isolate Soffset and Voffset, so Soffset is hard-coded to 0, and
......@@ -255,7 +251,7 @@ __device__ half_t amd_buffer_load<half_t, 1>(const half_t* p_src_wave,
return src_thread_data_valid ? __llvm_amdgcn_raw_buffer_load_f16(
src_wave_buffer_resource.data, src_thread_addr_offset, 0, 0)
: zero;
#endif // CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
#endif
}
template <>
......@@ -275,7 +271,7 @@ __device__ half2_t amd_buffer_load<half_t, 2>(const half_t* p_src_wave,
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
float dst_out_tmp = __llvm_amdgcn_buffer_load_f32(
......@@ -309,7 +305,7 @@ __device__ half4_t amd_buffer_load<half_t, 4>(const half_t* p_src_wave,
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
float2_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x2(
......@@ -343,7 +339,7 @@ __device__ half8_t amd_buffer_load<half_t, 8>(const half_t* p_src_wave,
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
float4_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x4(
......@@ -377,7 +373,7 @@ __device__ ushort amd_buffer_load<ushort, 1>(const ushort* p_src_wave,
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
// current code cannot isolate Soffset and Voffset, so Soffset is hard-coded to 0, and
......@@ -412,7 +408,7 @@ __device__ ushort2_t amd_buffer_load<ushort, 2>(const ushort* p_src_wave,
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
float dst_out_tmp = __llvm_amdgcn_buffer_load_f32(
......@@ -446,7 +442,7 @@ __device__ ushort4_t amd_buffer_load<ushort, 4>(const ushort* p_src_wave,
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
float2_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x2(
......@@ -480,7 +476,7 @@ __device__ ushort8_t amd_buffer_load<ushort, 8>(const ushort* p_src_wave,
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
float4_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x4(
......@@ -515,7 +511,7 @@ __device__ void amd_buffer_store<float, 1>(const float* p_src_thread,
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32(*p_src_thread,
......@@ -551,7 +547,7 @@ __device__ void amd_buffer_store<float, 2>(const float* p_src_thread,
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32x2(*reinterpret_cast<const float2_t*>(p_src_thread),
......@@ -591,7 +587,7 @@ __device__ void amd_buffer_store<float, 4>(const float* p_src_thread,
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32x4(*reinterpret_cast<const float4_t*>(p_src_thread),
......@@ -631,7 +627,7 @@ __device__ void amd_buffer_store<half_t, 1>(const half_t* p_src_thread,
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t);
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
// current code cannot isolate Soffset and Voffset, so Soffset is hard-coded to 0, and
......@@ -672,7 +668,7 @@ __device__ void amd_buffer_store<half_t, 2>(const half_t* p_src_thread,
const float* p_src_tmp = reinterpret_cast<const float*>(p_src_thread);
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32(*p_src_tmp,
......@@ -710,7 +706,7 @@ __device__ void amd_buffer_store<half_t, 4>(const half_t* p_src_thread,
const float2_t* p_src_tmp = reinterpret_cast<const float2_t*>(p_src_thread);
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32x2(*p_src_tmp,
......@@ -748,7 +744,7 @@ __device__ void amd_buffer_store<half_t, 8>(const half_t* p_src_thread,
const float4_t* p_src_tmp = reinterpret_cast<const float4_t*>(p_src_thread);
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32x4(*p_src_tmp,
......@@ -784,7 +780,7 @@ __device__ void amd_buffer_store<ushort, 1>(const ushort* p_src_thread,
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(ushort);
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_raw_buffer_store_bf16(*p_src_thread,
......@@ -821,7 +817,7 @@ __device__ void amd_buffer_store<ushort, 2>(const ushort* p_src_thread,
const float* p_src_tmp = reinterpret_cast<const float*>(p_src_thread);
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32(*p_src_tmp,
......@@ -859,7 +855,7 @@ __device__ void amd_buffer_store<ushort, 4>(const ushort* p_src_thread,
const float2_t* p_src_tmp = reinterpret_cast<const float2_t*>(p_src_thread);
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32x2(*p_src_tmp,
......@@ -897,7 +893,7 @@ __device__ void amd_buffer_store<ushort, 8>(const ushort* p_src_thread,
const float4_t* p_src_tmp = reinterpret_cast<const float4_t*>(p_src_thread);
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32x4(*p_src_tmp,
......@@ -934,7 +930,7 @@ __device__ void amd_buffer_atomic_add<float, 1>(const float* p_src_thread,
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_atomic_add_f32(*p_src_thread,
......@@ -969,7 +965,7 @@ __device__ void amd_buffer_atomic_add<float, 2>(const float* p_src_thread,
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
for(index_t i = 0; i < 2; ++i)
......@@ -1014,7 +1010,7 @@ __device__ void amd_buffer_atomic_add<float, 4>(const float* p_src_thread,
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
for(index_t i = 0; i < 4; ++i)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment