Commit 435f5f91 authored by Chao Liu's avatar Chao Liu
Browse files

buffer APIs use combined wave and thread offset

parent 7a3d9697
...@@ -13,6 +13,7 @@ union BufferResourceConstant ...@@ -13,6 +13,7 @@ union BufferResourceConstant
int32x4_t data; int32x4_t data;
T* address[2]; T* address[2];
int32_t range[4]; int32_t range[4];
int32_t config[4];
}; };
__device__ float __llvm_amdgcn_buffer_load_f32(int32x4_t srsrc, __device__ float __llvm_amdgcn_buffer_load_f32(int32x4_t srsrc,
...@@ -153,8 +154,7 @@ template <typename T, index_t VectorSize> ...@@ -153,8 +154,7 @@ template <typename T, index_t VectorSize>
__device__ typename vector_type<T, VectorSize>::MemoryType __device__ typename vector_type<T, VectorSize>::MemoryType
amd_buffer_load(const T* p_src_wave, amd_buffer_load(const T* p_src_wave,
index_t src_thread_data_offset, index_t src_thread_data_offset,
index_t src_const_data_offset, bool src_thread_data_valid,
bool src_data_valid,
index_t src_elemenst_space); index_t src_elemenst_space);
// buffer_store requires: // buffer_store requires:
...@@ -165,8 +165,7 @@ template <typename T, index_t VectorSize> ...@@ -165,8 +165,7 @@ template <typename T, index_t VectorSize>
__device__ void amd_buffer_store(const T* p_src_thread, __device__ void amd_buffer_store(const T* p_src_thread,
T* p_dst_wave, T* p_dst_wave,
index_t dst_thread_data_offset, index_t dst_thread_data_offset,
index_t dst_const_data_offset, bool dst_thread_data_valid,
bool dst_data_valid,
index_t dst_data_range); index_t dst_data_range);
// buffer_atomic requires: // buffer_atomic requires:
...@@ -177,201 +176,170 @@ template <typename T, index_t VectorSize> ...@@ -177,201 +176,170 @@ template <typename T, index_t VectorSize>
__device__ void amd_buffer_atomic_add(const T* p_src_thread, __device__ void amd_buffer_atomic_add(const T* p_src_thread,
T* p_dst_wave, T* p_dst_wave,
index_t dst_thread_data_offset, index_t dst_thread_data_offset,
index_t dst_const_data_offset, bool dst_thread_data_valid,
bool dst_data_valid,
index_t dst_data_range); index_t dst_data_range);
template <> template <>
__device__ float amd_buffer_load<float, 1>(const float* p_src_wave, __device__ float amd_buffer_load<float, 1>(const float* p_src_wave,
index_t src_thread_data_offset, index_t src_thread_data_offset,
index_t src_const_data_offset, bool src_thread_data_valid,
bool src_data_valid,
index_t src_data_range) index_t src_data_range)
{ {
BufferResourceConstant<float> src_wave_config; BufferResourceConstant<float> src_wave_buffer_resource;
// wavewise base address (64 bit) // wavewise base address (64 bit)
src_wave_config.address[0] = const_cast<float*>(p_src_wave); src_wave_buffer_resource.address[0] = const_cast<float*>(p_src_wave);
// wavewise range (32 bit) // wavewise range (32 bit)
src_wave_config.range[2] = src_data_range * sizeof(float); src_wave_buffer_resource.range[2] = src_data_range * sizeof(float);
// wavewise setting (32 bit) // wavewise setting (32 bit)
src_wave_config.range[3] = 0x00027000; src_wave_buffer_resource.config[3] = 0x00027000;
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
index_t src_const_addr_offset = src_const_data_offset * sizeof(float);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK #if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
return __llvm_amdgcn_buffer_load_f32( return __llvm_amdgcn_buffer_load_f32(src_wave_buffer_resource.data,
src_wave_config.data,
0,
src_data_valid ? (src_thread_addr_offset + src_const_addr_offset) : 0xffffffff,
false,
false);
#else
index_t src_addr_base = src_data_valid ? 0 : 0x7fffffff;
return __llvm_amdgcn_buffer_load_f32(src_wave_config.data,
0, 0,
src_addr_base + src_thread_addr_offset + src_thread_data_valid ? src_thread_addr_offset
src_const_addr_offset, : 0xffffffff,
false, false,
false); false);
#else
index_t src_addr_base = src_thread_data_valid ? 0 : 0x7fffffff;
return __llvm_amdgcn_buffer_load_f32(
src_wave_buffer_resource.data, 0, src_addr_base + src_thread_addr_offset, false, false);
#endif #endif
} }
template <> template <>
__device__ float2_t amd_buffer_load<float, 2>(const float* p_src_wave, __device__ float2_t amd_buffer_load<float, 2>(const float* p_src_wave,
index_t src_thread_data_offset, index_t src_thread_data_offset,
index_t src_const_data_offset, bool src_thread_data_valid,
bool src_data_valid,
index_t src_data_range) index_t src_data_range)
{ {
BufferResourceConstant<float> src_wave_config; BufferResourceConstant<float> src_wave_buffer_resource;
// wavewise base address (64 bit) // wavewise base address (64 bit)
src_wave_config.address[0] = const_cast<float*>(p_src_wave); src_wave_buffer_resource.address[0] = const_cast<float*>(p_src_wave);
// wavewise range (32 bit) // wavewise range (32 bit)
src_wave_config.range[2] = src_data_range * sizeof(float); src_wave_buffer_resource.range[2] = src_data_range * sizeof(float);
// wavewise setting (32 bit) // wavewise setting (32 bit)
src_wave_config.range[3] = 0x00027000; src_wave_buffer_resource.config[3] = 0x00027000;
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
index_t src_const_addr_offset = src_const_data_offset * sizeof(float);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK #if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
return __llvm_amdgcn_buffer_load_f32x2( return __llvm_amdgcn_buffer_load_f32x2(src_wave_buffer_resource.data,
src_wave_config.data,
0,
src_data_valid ? (src_thread_addr_offset + src_const_addr_offset) : 0xffffffff,
false,
false);
#else
index_t src_addr_base = src_data_valid ? 0 : 0x7fffffff;
return __llvm_amdgcn_buffer_load_f32x2(src_wave_config.data,
0, 0,
src_addr_base + src_thread_addr_offset + src_thread_data_valid ? src_thread_addr_offset
src_const_addr_offset, : 0xffffffff,
false, false,
false); false);
#else
index_t src_addr_base = src_thread_data_valid ? 0 : 0x7fffffff;
return __llvm_amdgcn_buffer_load_f32x2(
src_wave_buffer_resource.data, 0, src_addr_base + src_thread_addr_offset, false, false);
#endif #endif
} }
template <> template <>
__device__ float4_t amd_buffer_load<float, 4>(const float* p_src_wave, __device__ float4_t amd_buffer_load<float, 4>(const float* p_src_wave,
index_t src_thread_data_offset, index_t src_thread_data_offset,
index_t src_const_data_offset, bool src_thread_data_valid,
bool src_data_valid,
index_t src_data_range) index_t src_data_range)
{ {
BufferResourceConstant<float> src_wave_config; BufferResourceConstant<float> src_wave_buffer_resource;
// wavewise base address (64 bit) // wavewise base address (64 bit)
src_wave_config.address[0] = const_cast<float*>(p_src_wave); src_wave_buffer_resource.address[0] = const_cast<float*>(p_src_wave);
// wavewise range (32 bit) // wavewise range (32 bit)
src_wave_config.range[2] = src_data_range * sizeof(float); src_wave_buffer_resource.range[2] = src_data_range * sizeof(float);
// wavewise setting (32 bit) // wavewise setting (32 bit)
src_wave_config.range[3] = 0x00027000; src_wave_buffer_resource.config[3] = 0x00027000;
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
index_t src_const_addr_offset = src_const_data_offset * sizeof(float);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK #if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
return __llvm_amdgcn_buffer_load_f32x4( return __llvm_amdgcn_buffer_load_f32x4(src_wave_buffer_resource.data,
src_wave_config.data,
0,
src_data_valid ? (src_thread_addr_offset + src_const_addr_offset) : 0xffffffff,
false,
false);
#else
index_t src_addr_base = src_data_valid ? 0 : 0x7fffffff;
return __llvm_amdgcn_buffer_load_f32x4(src_wave_config.data,
0, 0,
src_addr_base + src_thread_addr_offset + src_thread_data_valid ? src_thread_addr_offset
src_const_addr_offset, : 0xffffffff,
false, false,
false); false);
#else
index_t src_addr_base = src_thread_data_valid ? 0 : 0x7fffffff;
return __llvm_amdgcn_buffer_load_f32x4(
src_wave_buffer_resource.data, 0, src_addr_base + src_thread_addr_offset, false, false);
#endif #endif
} }
template <> template <>
__device__ half_t amd_buffer_load<half_t, 1>(const half_t* p_src_wave, __device__ half_t amd_buffer_load<half_t, 1>(const half_t* p_src_wave,
index_t src_thread_data_offset, index_t src_thread_data_offset,
index_t src_const_data_offset, bool src_thread_data_valid,
bool src_data_valid,
index_t src_data_range) index_t src_data_range)
{ {
BufferResourceConstant<half_t> src_wave_config; BufferResourceConstant<half_t> src_wave_buffer_resource;
// wavewise base address (64 bit) // wavewise base address (64 bit)
src_wave_config.address[0] = const_cast<half_t*>(p_src_wave); src_wave_buffer_resource.address[0] = const_cast<half_t*>(p_src_wave);
// wavewise range (32 bit) // wavewise range (32 bit)
src_wave_config.range[2] = src_data_range * sizeof(half_t); src_wave_buffer_resource.range[2] = src_data_range * sizeof(half_t);
// wavewise setting (32 bit) // wavewise setting (32 bit)
src_wave_config.range[3] = 0x00027000; src_wave_buffer_resource.config[3] = 0x00027000;
#if !CK_WORKAROUND_SWDEV_231101 #if !CK_WORKAROUND_SWDEV_231101
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t); index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);
index_t src_const_addr_offset = src_const_data_offset * sizeof(half_t);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK #if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
return __llvm_amdgcn_buffer_load_f16( return __llvm_amdgcn_buffer_load_f16(src_wave_buffer_resource.data,
src_wave_config.data,
0,
src_data_valid ? (src_thread_addr_offset + src_const_addr_offset) : 0xffffffff,
false,
false);
#else
index_t src_addr_base = src_data_valid ? 0 : 0x7fffffff;
return __llvm_amdgcn_buffer_load_f16(src_wave_config.data,
0, 0,
src_addr_base + src_thread_addr_offset + src_thread_data_valid ? src_thread_addr_offset
src_const_addr_offset, : 0xffffffff,
false, false,
false); false);
#else
index_t src_addr_base = src_thread_data_valid ? 0 : 0x7fffffff;
return __llvm_amdgcn_buffer_load_f16(
src_wave_buffer_resource.data, 0, src_addr_base + src_thread_addr_offset, false, false);
#endif #endif
#else #else
return src_data_valid ? p_src_wave[src_thread_data_offset + src_const_data_offset] : 0; return src_thread_data_valid ? p_src_wave[src_thread_data_offset] : 0;
#endif #endif
} }
template <> template <>
__device__ half2_t amd_buffer_load<half_t, 2>(const half_t* p_src_wave, __device__ half2_t amd_buffer_load<half_t, 2>(const half_t* p_src_wave,
index_t src_thread_data_offset, index_t src_thread_data_offset,
index_t src_const_data_offset, bool src_thread_data_valid,
bool src_data_valid,
index_t src_data_range) index_t src_data_range)
{ {
BufferResourceConstant<half_t> src_wave_config; BufferResourceConstant<half_t> src_wave_buffer_resource;
// wavewise base address (64 bit) // wavewise base address (64 bit)
src_wave_config.address[0] = const_cast<half_t*>(p_src_wave); src_wave_buffer_resource.address[0] = const_cast<half_t*>(p_src_wave);
// wavewise range (32 bit) // wavewise range (32 bit)
src_wave_config.range[2] = src_data_range * sizeof(half_t); src_wave_buffer_resource.range[2] = src_data_range * sizeof(half_t);
// wavewise setting (32 bit) // wavewise setting (32 bit)
src_wave_config.range[3] = 0x00027000; src_wave_buffer_resource.config[3] = 0x00027000;
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t); index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);
index_t src_const_addr_offset = src_const_data_offset * sizeof(half_t);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK #if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
float dst_out_tmp = __llvm_amdgcn_buffer_load_f32( float dst_out_tmp =
src_wave_config.data, __llvm_amdgcn_buffer_load_f32(src_wave_buffer_resource.data,
0, 0,
src_data_valid ? (src_thread_addr_offset + src_const_addr_offset) : 0xffffffff, src_thread_data_valid ? src_thread_addr_offset : 0xffffffff,
false, false,
false); false);
#else #else
index_t src_addr_base = src_data_valid ? 0 : 0x7fffffff; index_t src_addr_base = src_thread_data_valid ? 0 : 0x7fffffff;
float dst_out_tmp = __llvm_amdgcn_buffer_load_f32(src_wave_config.data, float dst_out_tmp = __llvm_amdgcn_buffer_load_f32(
0, src_wave_buffer_resource.data, 0, src_addr_base + src_thread_addr_offset, false, false);
src_addr_base + src_thread_addr_offset +
src_const_addr_offset,
false,
false);
#endif #endif
return *reinterpret_cast<half2_t*>(&dst_out_tmp); return *reinterpret_cast<half2_t*>(&dst_out_tmp);
...@@ -380,38 +348,32 @@ __device__ half2_t amd_buffer_load<half_t, 2>(const half_t* p_src_wave, ...@@ -380,38 +348,32 @@ __device__ half2_t amd_buffer_load<half_t, 2>(const half_t* p_src_wave,
template <> template <>
__device__ half4_t amd_buffer_load<half_t, 4>(const half_t* p_src_wave, __device__ half4_t amd_buffer_load<half_t, 4>(const half_t* p_src_wave,
index_t src_thread_data_offset, index_t src_thread_data_offset,
index_t src_const_data_offset, bool src_thread_data_valid,
bool src_data_valid,
index_t src_data_range) index_t src_data_range)
{ {
BufferResourceConstant<half_t> src_wave_config; BufferResourceConstant<half_t> src_wave_buffer_resource;
// wavewise base address (64 bit) // wavewise base address (64 bit)
src_wave_config.address[0] = const_cast<half_t*>(p_src_wave); src_wave_buffer_resource.address[0] = const_cast<half_t*>(p_src_wave);
// wavewise range (32 bit) // wavewise range (32 bit)
src_wave_config.range[2] = src_data_range * sizeof(half_t); src_wave_buffer_resource.range[2] = src_data_range * sizeof(half_t);
// wavewise setting (32 bit) // wavewise setting (32 bit)
src_wave_config.range[3] = 0x00027000; src_wave_buffer_resource.config[3] = 0x00027000;
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t); index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);
index_t src_const_addr_offset = src_const_data_offset * sizeof(half_t);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK #if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
float2_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x2( float2_t dst_out_tmp =
src_wave_config.data, __llvm_amdgcn_buffer_load_f32x2(src_wave_buffer_resource.data,
0, 0,
src_data_valid ? (src_thread_addr_offset + src_const_addr_offset) : 0xffffffff, src_thread_data_valid ? src_thread_addr_offset : 0xffffffff,
false, false,
false); false);
#else #else
index_t src_addr_base = src_data_valid ? 0 : 0x7fffffff; index_t src_addr_base = src_thread_data_valid ? 0 : 0x7fffffff;
float2_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x2(src_wave_config.data, float2_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x2(
0, src_wave_buffer_resource.data, 0, src_addr_base + src_thread_addr_offset, false, false);
src_addr_base + src_thread_addr_offset +
src_const_addr_offset,
false,
false);
#endif #endif
return *reinterpret_cast<half4_t*>(&dst_out_tmp); return *reinterpret_cast<half4_t*>(&dst_out_tmp);
...@@ -420,38 +382,32 @@ __device__ half4_t amd_buffer_load<half_t, 4>(const half_t* p_src_wave, ...@@ -420,38 +382,32 @@ __device__ half4_t amd_buffer_load<half_t, 4>(const half_t* p_src_wave,
template <> template <>
__device__ half8_t amd_buffer_load<half_t, 8>(const half_t* p_src_wave, __device__ half8_t amd_buffer_load<half_t, 8>(const half_t* p_src_wave,
index_t src_thread_data_offset, index_t src_thread_data_offset,
index_t src_const_data_offset, bool src_thread_data_valid,
bool src_data_valid,
index_t src_data_range) index_t src_data_range)
{ {
BufferResourceConstant<half_t> src_wave_config; BufferResourceConstant<half_t> src_wave_buffer_resource;
// wavewise base address (64 bit) // wavewise base address (64 bit)
src_wave_config.address[0] = const_cast<half_t*>(p_src_wave); src_wave_buffer_resource.address[0] = const_cast<half_t*>(p_src_wave);
// wavewise range (32 bit) // wavewise range (32 bit)
src_wave_config.range[2] = src_data_range * sizeof(half_t); src_wave_buffer_resource.range[2] = src_data_range * sizeof(half_t);
// wavewise setting (32 bit) // wavewise setting (32 bit)
src_wave_config.range[3] = 0x00027000; src_wave_buffer_resource.config[3] = 0x00027000;
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t); index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);
index_t src_const_addr_offset = src_const_data_offset * sizeof(half_t);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK #if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
float4_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x4( float4_t dst_out_tmp =
src_wave_config.data, __llvm_amdgcn_buffer_load_f32x4(src_wave_buffer_resource.data,
0, 0,
src_data_valid ? (src_thread_addr_offset + src_const_addr_offset) : 0xffffffff, src_thread_data_valid ? src_thread_addr_offset : 0xffffffff,
false, false,
false); false);
#else #else
index_t src_addr_base = src_data_valid ? 0 : 0x7fffffff; index_t src_addr_base = src_thread_data_valid ? 0 : 0x7fffffff;
float4_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x4(src_wave_config.data, float4_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x4(
0, src_wave_buffer_resource.data, 0, src_addr_base + src_thread_addr_offset, false, false);
src_addr_base + src_thread_addr_offset +
src_const_addr_offset,
false,
false);
#endif #endif
return *reinterpret_cast<half8_t*>(&dst_out_tmp); return *reinterpret_cast<half8_t*>(&dst_out_tmp);
...@@ -460,81 +416,69 @@ __device__ half8_t amd_buffer_load<half_t, 8>(const half_t* p_src_wave, ...@@ -460,81 +416,69 @@ __device__ half8_t amd_buffer_load<half_t, 8>(const half_t* p_src_wave,
template <> template <>
__device__ ushort amd_buffer_load<ushort, 1>(const ushort* p_src_wave, __device__ ushort amd_buffer_load<ushort, 1>(const ushort* p_src_wave,
index_t src_thread_data_offset, index_t src_thread_data_offset,
index_t src_const_data_offset, bool src_thread_data_valid,
bool src_data_valid,
index_t src_data_range) index_t src_data_range)
{ {
BufferResourceConstant<ushort> src_wave_config; BufferResourceConstant<ushort> src_wave_buffer_resource;
// wavewise base address (64 bit) // wavewise base address (64 bit)
src_wave_config.address[0] = const_cast<ushort*>(p_src_wave); src_wave_buffer_resource.address[0] = const_cast<ushort*>(p_src_wave);
// wavewise range (32 bit) // wavewise range (32 bit)
src_wave_config.range[2] = src_data_range * sizeof(ushort); src_wave_buffer_resource.range[2] = src_data_range * sizeof(ushort);
// wavewise setting (32 bit) // wavewise setting (32 bit)
src_wave_config.range[3] = 0x00027000; src_wave_buffer_resource.config[3] = 0x00027000;
#if !CK_WORKAROUND_SWDEV_231101 #if !CK_WORKAROUND_SWDEV_231101
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort); index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);
index_t src_const_addr_offset = src_const_data_offset * sizeof(ushort);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK #if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
return __llvm_amdgcn_buffer_load_bf16( return __llvm_amdgcn_buffer_load_bf16(src_wave_buffer_resource.data,
src_wave_config.data,
0,
src_data_valid ? (src_thread_addr_offset + src_const_addr_offset) : 0xffffffff,
false,
false);
#else
index_t src_addr_base = src_data_valid ? 0 : 0x7fffffff;
return __llvm_amdgcn_buffer_load_bf16(src_wave_config.data,
0, 0,
src_addr_base + src_thread_addr_offset + src_thread_data_valid ? src_thread_addr_offset
src_const_addr_offset, : 0xffffffff,
false, false,
false); false);
#else
index_t src_addr_base = src_thread_data_valid ? 0 : 0x7fffffff;
return __llvm_amdgcn_buffer_load_bf16(
src_wave_buffer_resource.data, 0, src_addr_base + src_thread_addr_offset, false, false);
#endif #endif
#else #else
return src_data_valid ? p_src_wave[src_thread_data_offset + src_const_data_offset] : 0; return src_thread_data_valid ? p_src_wave[src_thread_data_offset] : 0;
#endif #endif
} }
template <> template <>
__device__ ushort2_t amd_buffer_load<ushort, 2>(const ushort* p_src_wave, __device__ ushort2_t amd_buffer_load<ushort, 2>(const ushort* p_src_wave,
index_t src_thread_data_offset, index_t src_thread_data_offset,
index_t src_const_data_offset, bool src_thread_data_valid,
bool src_data_valid,
index_t src_data_range) index_t src_data_range)
{ {
BufferResourceConstant<ushort> src_wave_config; BufferResourceConstant<ushort> src_wave_buffer_resource;
// wavewise base address (64 bit) // wavewise base address (64 bit)
src_wave_config.address[0] = const_cast<ushort*>(p_src_wave); src_wave_buffer_resource.address[0] = const_cast<ushort*>(p_src_wave);
// wavewise range (32 bit) // wavewise range (32 bit)
src_wave_config.range[2] = src_data_range * sizeof(ushort); src_wave_buffer_resource.range[2] = src_data_range * sizeof(ushort);
// wavewise setting (32 bit) // wavewise setting (32 bit)
src_wave_config.range[3] = 0x00027000; src_wave_buffer_resource.config[3] = 0x00027000;
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort); index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);
index_t src_const_addr_offset = src_const_data_offset * sizeof(ushort);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK #if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
float dst_out_tmp = __llvm_amdgcn_buffer_load_f32( float dst_out_tmp =
src_wave_config.data, __llvm_amdgcn_buffer_load_f32(src_wave_buffer_resource.data,
0, 0,
src_data_valid ? (src_thread_addr_offset + src_const_addr_offset) : 0xffffffff, src_thread_data_valid ? src_thread_addr_offset : 0xffffffff,
false, false,
false); false);
#else #else
index_t src_addr_base = src_data_valid ? 0 : 0x7fffffff; index_t src_addr_base = src_thread_data_valid ? 0 : 0x7fffffff;
float dst_out_tmp = __llvm_amdgcn_buffer_load_f32(src_wave_config.data, float dst_out_tmp = __llvm_amdgcn_buffer_load_f32(
0, src_wave_buffer_resource.data, 0, src_addr_base + src_thread_addr_offset, false, false);
src_addr_base + src_thread_addr_offset +
src_const_addr_offset,
false,
false);
#endif #endif
return *reinterpret_cast<ushort2_t*>(&dst_out_tmp); return *reinterpret_cast<ushort2_t*>(&dst_out_tmp);
...@@ -543,38 +487,32 @@ __device__ ushort2_t amd_buffer_load<ushort, 2>(const ushort* p_src_wave, ...@@ -543,38 +487,32 @@ __device__ ushort2_t amd_buffer_load<ushort, 2>(const ushort* p_src_wave,
template <> template <>
__device__ ushort4_t amd_buffer_load<ushort, 4>(const ushort* p_src_wave, __device__ ushort4_t amd_buffer_load<ushort, 4>(const ushort* p_src_wave,
index_t src_thread_data_offset, index_t src_thread_data_offset,
index_t src_const_data_offset, bool src_thread_data_valid,
bool src_data_valid,
index_t src_data_range) index_t src_data_range)
{ {
BufferResourceConstant<ushort> src_wave_config; BufferResourceConstant<ushort> src_wave_buffer_resource;
// wavewise base address (64 bit) // wavewise base address (64 bit)
src_wave_config.address[0] = const_cast<ushort*>(p_src_wave); src_wave_buffer_resource.address[0] = const_cast<ushort*>(p_src_wave);
// wavewise range (32 bit) // wavewise range (32 bit)
src_wave_config.range[2] = src_data_range * sizeof(ushort); src_wave_buffer_resource.range[2] = src_data_range * sizeof(ushort);
// wavewise setting (32 bit) // wavewise setting (32 bit)
src_wave_config.range[3] = 0x00027000; src_wave_buffer_resource.config[3] = 0x00027000;
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort); index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);
index_t src_const_addr_offset = src_const_data_offset * sizeof(ushort);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK #if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
float2_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x2( float2_t dst_out_tmp =
src_wave_config.data, __llvm_amdgcn_buffer_load_f32x2(src_wave_buffer_resource.data,
0, 0,
src_data_valid ? (src_thread_addr_offset + src_const_addr_offset) : 0xffffffff, src_thread_data_valid ? src_thread_addr_offset : 0xffffffff,
false, false,
false); false);
#else #else
index_t src_addr_base = src_data_valid ? 0 : 0x7fffffff; index_t src_addr_base = src_thread_data_valid ? 0 : 0x7fffffff;
float2_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x2(src_wave_config.data, float2_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x2(
0, src_wave_buffer_resource.data, 0, src_addr_base + src_thread_addr_offset, false, false);
src_addr_base + src_thread_addr_offset +
src_const_addr_offset,
false,
false);
#endif #endif
return *reinterpret_cast<ushort4_t*>(&dst_out_tmp); return *reinterpret_cast<ushort4_t*>(&dst_out_tmp);
...@@ -583,38 +521,32 @@ __device__ ushort4_t amd_buffer_load<ushort, 4>(const ushort* p_src_wave, ...@@ -583,38 +521,32 @@ __device__ ushort4_t amd_buffer_load<ushort, 4>(const ushort* p_src_wave,
template <> template <>
__device__ ushort8_t amd_buffer_load<ushort, 8>(const ushort* p_src_wave, __device__ ushort8_t amd_buffer_load<ushort, 8>(const ushort* p_src_wave,
index_t src_thread_data_offset, index_t src_thread_data_offset,
index_t src_const_data_offset, bool src_thread_data_valid,
bool src_data_valid,
index_t src_data_range) index_t src_data_range)
{ {
BufferResourceConstant<ushort> src_wave_config; BufferResourceConstant<ushort> src_wave_buffer_resource;
// wavewise base address (64 bit) // wavewise base address (64 bit)
src_wave_config.address[0] = const_cast<ushort*>(p_src_wave); src_wave_buffer_resource.address[0] = const_cast<ushort*>(p_src_wave);
// wavewise range (32 bit) // wavewise range (32 bit)
src_wave_config.range[2] = src_data_range * sizeof(ushort); src_wave_buffer_resource.range[2] = src_data_range * sizeof(ushort);
// wavewise setting (32 bit) // wavewise setting (32 bit)
src_wave_config.range[3] = 0x00027000; src_wave_buffer_resource.config[3] = 0x00027000;
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort); index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);
index_t src_const_addr_offset = src_const_data_offset * sizeof(ushort);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK #if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
float4_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x4( float4_t dst_out_tmp =
src_wave_config.data, __llvm_amdgcn_buffer_load_f32x4(src_wave_buffer_resource.data,
0, 0,
src_data_valid ? (src_thread_addr_offset + src_const_addr_offset) : 0xffffffff, src_thread_data_valid ? src_thread_addr_offset : 0xffffffff,
false, false,
false); false);
#else #else
index_t src_addr_base = src_data_valid ? 0 : 0x7fffffff; index_t src_addr_base = src_thread_data_valid ? 0 : 0x7fffffff;
float4_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x4(src_wave_config.data, float4_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x4(
0, src_wave_buffer_resource.data, 0, src_addr_base + src_thread_addr_offset, false, false);
src_addr_base + src_thread_addr_offset +
src_const_addr_offset,
false,
false);
#endif #endif
return *reinterpret_cast<ushort8_t*>(&dst_out_tmp); return *reinterpret_cast<ushort8_t*>(&dst_out_tmp);
...@@ -624,37 +556,34 @@ template <> ...@@ -624,37 +556,34 @@ template <>
__device__ void amd_buffer_store<float, 1>(const float* p_src_thread, __device__ void amd_buffer_store<float, 1>(const float* p_src_thread,
float* p_dst_wave, float* p_dst_wave,
index_t dst_thread_data_offset, index_t dst_thread_data_offset,
index_t dst_const_data_offset, bool dst_thread_data_valid,
bool dst_data_valid,
index_t dst_data_range) index_t dst_data_range)
{ {
BufferResourceConstant<float> dst_wave_config; BufferResourceConstant<float> dst_wave_buffer_resource;
// wavewise base address (64 bit) // wavewise base address (64 bit)
dst_wave_config.address[0] = p_dst_wave; dst_wave_buffer_resource.address[0] = p_dst_wave;
// wavewise range (32 bit) // wavewise range (32 bit)
dst_wave_config.range[2] = dst_data_range * sizeof(float); dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
// wavewise setting (32 bit) // wavewise setting (32 bit)
dst_wave_config.range[3] = 0x00027000; dst_wave_buffer_resource.config[3] = 0x00027000;
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK #if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_store_f32(*p_src_thread, __llvm_amdgcn_buffer_store_f32(*p_src_thread,
dst_wave_config.data, dst_wave_buffer_resource.data,
0, 0,
dst_data_valid ? (dst_thread_addr_offset + dst_const_addr_offset) dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
: 0xffffffff,
false, false,
false); false);
#else #else
index_t dst_addr_base = dst_data_valid ? 0 : 0x7fffffff; index_t dst_addr_base = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32(*p_src_thread, __llvm_amdgcn_buffer_store_f32(*p_src_thread,
dst_wave_config.data, dst_wave_buffer_resource.data,
0, 0,
dst_addr_base + dst_thread_addr_offset + dst_const_addr_offset, dst_addr_base + dst_thread_addr_offset,
false, false,
false); false);
#endif #endif
...@@ -664,37 +593,34 @@ template <> ...@@ -664,37 +593,34 @@ template <>
__device__ void amd_buffer_store<float, 2>(const float* p_src_thread, __device__ void amd_buffer_store<float, 2>(const float* p_src_thread,
float* p_dst_wave, float* p_dst_wave,
index_t dst_thread_data_offset, index_t dst_thread_data_offset,
index_t dst_const_data_offset, bool dst_thread_data_valid,
bool dst_data_valid,
index_t dst_data_range) index_t dst_data_range)
{ {
BufferResourceConstant<float> dst_wave_config; BufferResourceConstant<float> dst_wave_buffer_resource;
// wavewise base address (64 bit) // wavewise base address (64 bit)
dst_wave_config.address[0] = p_dst_wave; dst_wave_buffer_resource.address[0] = p_dst_wave;
// wavewise range (32 bit) // wavewise range (32 bit)
dst_wave_config.range[2] = dst_data_range * sizeof(float); dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
// wavewise setting (32 bit) // wavewise setting (32 bit)
dst_wave_config.range[3] = 0x00027000; dst_wave_buffer_resource.config[3] = 0x00027000;
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK #if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_store_f32x2( __llvm_amdgcn_buffer_store_f32x2(*reinterpret_cast<const float2_t*>(p_src_thread),
*reinterpret_cast<const float2_t*>(p_src_thread), dst_wave_buffer_resource.data,
dst_wave_config.data, 0,
0, dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
dst_data_valid ? (dst_thread_addr_offset + dst_const_addr_offset) : 0xffffffff, false,
false, false);
false);
#else #else
index_t dst_addr_base = dst_data_valid ? 0 : 0x7fffffff; index_t dst_addr_base = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32x2(*reinterpret_cast<const float2_t*>(p_src_thread), __llvm_amdgcn_buffer_store_f32x2(*reinterpret_cast<const float2_t*>(p_src_thread),
dst_wave_config.data, dst_wave_buffer_resource.data,
0, 0,
dst_addr_base + dst_thread_addr_offset + dst_const_addr_offset, dst_addr_base + dst_thread_addr_offset,
false, false,
false); false);
#endif #endif
...@@ -704,37 +630,34 @@ template <> ...@@ -704,37 +630,34 @@ template <>
__device__ void amd_buffer_store<float, 4>(const float* p_src_thread, __device__ void amd_buffer_store<float, 4>(const float* p_src_thread,
float* p_dst_wave, float* p_dst_wave,
index_t dst_thread_data_offset, index_t dst_thread_data_offset,
index_t dst_const_data_offset, bool dst_thread_data_valid,
bool dst_data_valid,
index_t dst_data_range) index_t dst_data_range)
{ {
BufferResourceConstant<float> dst_wave_config; BufferResourceConstant<float> dst_wave_buffer_resource;
// wavewise base address (64 bit) // wavewise base address (64 bit)
dst_wave_config.address[0] = p_dst_wave; dst_wave_buffer_resource.address[0] = p_dst_wave;
// wavewise range (32 bit) // wavewise range (32 bit)
dst_wave_config.range[2] = dst_data_range * sizeof(float); dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
// wavewise setting (32 bit) // wavewise setting (32 bit)
dst_wave_config.range[3] = 0x00027000; dst_wave_buffer_resource.config[3] = 0x00027000;
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK #if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_store_f32x4( __llvm_amdgcn_buffer_store_f32x4(*reinterpret_cast<const float4_t*>(p_src_thread),
*reinterpret_cast<const float4_t*>(p_src_thread), dst_wave_buffer_resource.data,
dst_wave_config.data, 0,
0, dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
dst_data_valid ? (dst_thread_addr_offset + dst_const_addr_offset) : 0xffffffff, false,
false, false);
false);
#else #else
index_t dst_addr_base = dst_data_valid ? 0 : 0x7fffffff; index_t dst_addr_base = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32x4(*reinterpret_cast<const float4_t*>(p_src_thread), __llvm_amdgcn_buffer_store_f32x4(*reinterpret_cast<const float4_t*>(p_src_thread),
dst_wave_config.data, dst_wave_buffer_resource.data,
0, 0,
dst_addr_base + dst_thread_addr_offset + dst_const_addr_offset, dst_addr_base + dst_thread_addr_offset,
false, false,
false); false);
#endif #endif
...@@ -744,46 +667,43 @@ template <> ...@@ -744,46 +667,43 @@ template <>
__device__ void amd_buffer_store<half_t, 1>(const half_t* p_src_thread, __device__ void amd_buffer_store<half_t, 1>(const half_t* p_src_thread,
half_t* p_dst_wave, half_t* p_dst_wave,
index_t dst_thread_data_offset, index_t dst_thread_data_offset,
index_t dst_const_data_offset, bool dst_thread_data_valid,
bool dst_data_valid,
index_t dst_data_range) index_t dst_data_range)
{ {
BufferResourceConstant<half_t> dst_wave_config; BufferResourceConstant<half_t> dst_wave_buffer_resource;
// wavewise base address (64 bit) // wavewise base address (64 bit)
dst_wave_config.address[0] = p_dst_wave; dst_wave_buffer_resource.address[0] = p_dst_wave;
// wavewise range (32 bit) // wavewise range (32 bit)
dst_wave_config.range[2] = dst_data_range * sizeof(half_t); dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(half_t);
// wavewise setting (32 bit) // wavewise setting (32 bit)
dst_wave_config.range[3] = 0x00027000; dst_wave_buffer_resource.config[3] = 0x00027000;
#if !CK_WORKAROUND_SWDEV_231101 #if !CK_WORKAROUND_SWDEV_231101
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t); index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t);
index_t dst_const_addr_offset = dst_const_data_offset * sizeof(half_t);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK #if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_store_f16(*p_src_thread, __llvm_amdgcn_buffer_store_f16(*p_src_thread,
dst_wave_config.data, dst_wave_buffer_resource.data,
0, 0,
dst_data_valid ? (dst_thread_addr_offset + dst_const_addr_offset) dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
: 0xffffffff,
false, false,
false); false);
#else #else
index_t dst_addr_base = dst_data_valid ? 0 : 0x7fffffff; index_t dst_addr_base = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f16(*p_src_thread, __llvm_amdgcn_buffer_store_f16(*p_src_thread,
dst_wave_config.data, dst_wave_buffer_resource.data,
0, 0,
dst_addr_base + dst_thread_addr_offset + dst_const_addr_offset, dst_addr_base + dst_thread_addr_offset,
false, false,
false); false);
#endif #endif
#else #else
if(dst_data_valid) if(dst_thread_data_valid)
{ {
p_dst_wave[dst_thread_data_offset + dst_const_data_offset] = *p_src_thread; p_dst_wave[dst_thread_data_offset] = *p_src_thread;
} }
#endif #endif
} }
...@@ -792,39 +712,36 @@ template <> ...@@ -792,39 +712,36 @@ template <>
__device__ void amd_buffer_store<half_t, 2>(const half_t* p_src_thread, __device__ void amd_buffer_store<half_t, 2>(const half_t* p_src_thread,
half_t* p_dst_wave, half_t* p_dst_wave,
index_t dst_thread_data_offset, index_t dst_thread_data_offset,
index_t dst_const_data_offset, bool dst_thread_data_valid,
bool dst_data_valid,
index_t dst_data_range) index_t dst_data_range)
{ {
BufferResourceConstant<half_t> dst_wave_config; BufferResourceConstant<half_t> dst_wave_buffer_resource;
// wavewise base address (64 bit) // wavewise base address (64 bit)
dst_wave_config.address[0] = p_dst_wave; dst_wave_buffer_resource.address[0] = p_dst_wave;
// wavewise range (32 bit) // wavewise range (32 bit)
dst_wave_config.range[2] = dst_data_range * sizeof(half_t); dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(half_t);
// wavewise setting (32 bit) // wavewise setting (32 bit)
dst_wave_config.range[3] = 0x00027000; dst_wave_buffer_resource.config[3] = 0x00027000;
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t); index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t);
index_t dst_const_addr_offset = dst_const_data_offset * sizeof(half_t);
const float* p_src_tmp = reinterpret_cast<const float*>(p_src_thread); const float* p_src_tmp = reinterpret_cast<const float*>(p_src_thread);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK #if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_store_f32(*p_src_tmp, __llvm_amdgcn_buffer_store_f32(*p_src_tmp,
dst_wave_config.data, dst_wave_buffer_resource.data,
0, 0,
dst_data_valid ? (dst_thread_addr_offset + dst_const_addr_offset) dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
: 0xffffffff,
false, false,
false); false);
#else #else
index_t dst_addr_base = dst_data_valid ? 0 : 0x7fffffff; index_t dst_addr_base = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32(*p_src_tmp, __llvm_amdgcn_buffer_store_f32(*p_src_tmp,
dst_wave_config.data, dst_wave_buffer_resource.data,
0, 0,
dst_addr_base + dst_thread_addr_offset + dst_const_addr_offset, dst_addr_base + dst_thread_addr_offset,
false, false,
false); false);
#endif #endif
...@@ -834,39 +751,38 @@ template <> ...@@ -834,39 +751,38 @@ template <>
__device__ void amd_buffer_store<half_t, 4>(const half_t* p_src_thread, __device__ void amd_buffer_store<half_t, 4>(const half_t* p_src_thread,
half_t* p_dst_wave, half_t* p_dst_wave,
index_t dst_thread_data_offset, index_t dst_thread_data_offset,
index_t dst_const_data_offset, bool dst_thread_data_valid,
bool dst_data_valid,
index_t dst_data_range) index_t dst_data_range)
{ {
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t); BufferResourceConstant<half_t> dst_wave_buffer_resource;
index_t dst_const_addr_offset = dst_const_data_offset * sizeof(half_t);
BufferResourceConstant<half_t> dst_wave_config;
// wavewise base address (64 bit) // wavewise base address (64 bit)
dst_wave_config.address[0] = p_dst_wave; dst_wave_buffer_resource.address[0] = p_dst_wave;
// wavewise range (32 bit) // wavewise range (32 bit)
dst_wave_config.range[2] = dst_data_range * sizeof(half_t); dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(half_t);
// wavewise setting (32 bit) // wavewise setting (32 bit)
dst_wave_config.range[3] = 0x00027000; dst_wave_buffer_resource.config[3] = 0x00027000;
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t);
const float2_t* p_src_tmp = reinterpret_cast<const float2_t*>(p_src_thread); const float2_t* p_src_tmp = reinterpret_cast<const float2_t*>(p_src_thread);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK #if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_store_f32x2( __llvm_amdgcn_buffer_store_f32x2(*p_src_tmp,
*p_src_tmp, dst_wave_buffer_resource.data,
dst_wave_config.data, 0,
0, dst_thread_data_valid
dst_data_valid ? (dst_thread_addr_offset + dst_const_addr_offset) : 0xffffffff, ? dst_thread_addr_offset
false, : 0xffffffff,
false); false,
false);
#else #else
index_t dst_addr_base = dst_data_valid ? 0 : 0x7fffffff; index_t dst_addr_base = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32x2(*p_src_tmp, __llvm_amdgcn_buffer_store_f32x2(*p_src_tmp,
dst_wave_config.data, dst_wave_buffer_resource.data,
0, 0,
dst_addr_base + dst_thread_addr_offset + dst_const_addr_offset, dst_addr_base + dst_thread_addr_offset,
false, false,
false); false);
#endif #endif
...@@ -876,39 +792,36 @@ template <> ...@@ -876,39 +792,36 @@ template <>
__device__ void amd_buffer_store<half_t, 8>(const half_t* p_src_thread, __device__ void amd_buffer_store<half_t, 8>(const half_t* p_src_thread,
half_t* p_dst_wave, half_t* p_dst_wave,
index_t dst_thread_data_offset, index_t dst_thread_data_offset,
index_t dst_const_data_offset, bool dst_thread_data_valid,
bool dst_data_valid,
index_t dst_data_range) index_t dst_data_range)
{ {
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t); BufferResourceConstant<half_t> dst_wave_buffer_resource;
index_t dst_const_addr_offset = dst_const_data_offset * sizeof(half_t);
BufferResourceConstant<half_t> dst_wave_config;
// wavewise base address (64 bit) // wavewise base address (64 bit)
dst_wave_config.address[0] = p_dst_wave; dst_wave_buffer_resource.address[0] = p_dst_wave;
// wavewise range (32 bit) // wavewise range (32 bit)
dst_wave_config.range[2] = dst_data_range * sizeof(half_t); dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(half_t);
// wavewise setting (32 bit) // wavewise setting (32 bit)
dst_wave_config.range[3] = 0x00027000; dst_wave_buffer_resource.config[3] = 0x00027000;
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t);
const float4_t* p_src_tmp = reinterpret_cast<const float4_t*>(p_src_thread); const float4_t* p_src_tmp = reinterpret_cast<const float4_t*>(p_src_thread);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK #if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_store_f32x4( __llvm_amdgcn_buffer_store_f32x4(*p_src_tmp,
*p_src_tmp, dst_wave_buffer_resource.data,
dst_wave_config.data, 0,
0, dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
dst_data_valid ? (dst_thread_addr_offset + dst_const_addr_offset) : 0xffffffff, false,
false, false);
false);
#else #else
index_t dst_addr_base = dst_data_valid ? 0 : 0x7fffffff; index_t dst_addr_base = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32x4(*p_src_tmp, __llvm_amdgcn_buffer_store_f32x4(*p_src_tmp,
dst_wave_config.data, dst_wave_buffer_resource.data,
0, 0,
dst_addr_base + dst_thread_addr_offset + dst_const_addr_offset, dst_addr_base + dst_thread_addr_offset,
false, false,
false); false);
#endif #endif
...@@ -918,46 +831,43 @@ template <> ...@@ -918,46 +831,43 @@ template <>
__device__ void amd_buffer_store<ushort, 1>(const ushort* p_src_thread, __device__ void amd_buffer_store<ushort, 1>(const ushort* p_src_thread,
ushort* p_dst_wave, ushort* p_dst_wave,
index_t dst_thread_data_offset, index_t dst_thread_data_offset,
index_t dst_const_data_offset, bool dst_thread_data_valid,
bool dst_data_valid,
index_t dst_data_range) index_t dst_data_range)
{ {
BufferResourceConstant<ushort> dst_wave_config; BufferResourceConstant<ushort> dst_wave_buffer_resource;
// wavewise base address (64 bit) // wavewise base address (64 bit)
dst_wave_config.address[0] = p_dst_wave; dst_wave_buffer_resource.address[0] = p_dst_wave;
// wavewise range (32 bit) // wavewise range (32 bit)
dst_wave_config.range[2] = dst_data_range * sizeof(ushort); dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(ushort);
// wavewise setting (32 bit) // wavewise setting (32 bit)
dst_wave_config.range[3] = 0x00027000; dst_wave_buffer_resource.config[3] = 0x00027000;
#if !CK_WORKAROUND_SWDEV_231101 #if !CK_WORKAROUND_SWDEV_231101
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(ushort); index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(ushort);
index_t dst_const_addr_offset = dst_const_data_offset * sizeof(ushort);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK #if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_store_bf16( __llvm_amdgcn_buffer_store_bf16(*p_src_thread,
*p_src_thread, dst_wave_buffer_resource.data,
dst_wave_config.data, 0,
0, dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
dst_data_valid ? (dst_thread_addr_offset + dst_const_addr_offset) : 0xffffffff, false,
false, false);
false);
#else #else
index_t dst_addr_base = dst_data_valid ? 0 : 0x7fffffff; index_t dst_addr_base = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_bf16(*p_src_thread, __llvm_amdgcn_buffer_store_bf16(*p_src_thread,
dst_wave_config.data, dst_wave_buffer_resource.data,
0, 0,
dst_addr_base + dst_thread_addr_offset + dst_const_addr_offset, dst_addr_base + dst_thread_addr_offset,
false, false,
false); false);
#endif #endif
#else #else
if(dst_data_valid) if(dst_thread_data_valid)
{ {
p_dst_wave[dst_thread_data_offset + dst_const_data_offset] = *p_src_thread; p_dst_wave[dst_thread_data_offset] = *p_src_thread;
} }
#endif #endif
} }
...@@ -966,39 +876,36 @@ template <> ...@@ -966,39 +876,36 @@ template <>
__device__ void amd_buffer_store<ushort, 2>(const ushort* p_src_thread, __device__ void amd_buffer_store<ushort, 2>(const ushort* p_src_thread,
ushort* p_dst_wave, ushort* p_dst_wave,
index_t dst_thread_data_offset, index_t dst_thread_data_offset,
index_t dst_const_data_offset, bool dst_thread_data_valid,
bool dst_data_valid,
index_t dst_data_range) index_t dst_data_range)
{ {
BufferResourceConstant<ushort> dst_wave_config; BufferResourceConstant<ushort> dst_wave_buffer_resource;
// wavewise base address (64 bit) // wavewise base address (64 bit)
dst_wave_config.address[0] = p_dst_wave; dst_wave_buffer_resource.address[0] = p_dst_wave;
// wavewise range (32 bit) // wavewise range (32 bit)
dst_wave_config.range[2] = dst_data_range * sizeof(ushort); dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(ushort);
// wavewise setting (32 bit) // wavewise setting (32 bit)
dst_wave_config.range[3] = 0x00027000; dst_wave_buffer_resource.config[3] = 0x00027000;
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(ushort); index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(ushort);
index_t dst_const_addr_offset = dst_const_data_offset * sizeof(ushort);
const float* p_src_tmp = reinterpret_cast<const float*>(p_src_thread); const float* p_src_tmp = reinterpret_cast<const float*>(p_src_thread);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK #if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_store_f32(*p_src_tmp, __llvm_amdgcn_buffer_store_f32(*p_src_tmp,
dst_wave_config.data, dst_wave_buffer_resource.data,
0, 0,
dst_data_valid ? (dst_thread_addr_offset + dst_const_addr_offset) dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
: 0xffffffff,
false, false,
false); false);
#else #else
index_t dst_addr_base = dst_data_valid ? 0 : 0x7fffffff; index_t dst_addr_base = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32(*p_src_tmp, __llvm_amdgcn_buffer_store_f32(*p_src_tmp,
dst_wave_config.data, dst_wave_buffer_resource.data,
0, 0,
dst_addr_base + dst_thread_addr_offset + dst_const_addr_offset, dst_addr_base + dst_thread_addr_offset,
false, false,
false); false);
#endif #endif
...@@ -1008,39 +915,36 @@ template <> ...@@ -1008,39 +915,36 @@ template <>
__device__ void amd_buffer_store<ushort, 4>(const ushort* p_src_thread, __device__ void amd_buffer_store<ushort, 4>(const ushort* p_src_thread,
ushort* p_dst_wave, ushort* p_dst_wave,
index_t dst_thread_data_offset, index_t dst_thread_data_offset,
index_t dst_const_data_offset, bool dst_thread_data_valid,
bool dst_data_valid,
index_t dst_data_range) index_t dst_data_range)
{ {
BufferResourceConstant<ushort> dst_wave_config; BufferResourceConstant<ushort> dst_wave_buffer_resource;
// wavewise base address (64 bit) // wavewise base address (64 bit)
dst_wave_config.address[0] = p_dst_wave; dst_wave_buffer_resource.address[0] = p_dst_wave;
// wavewise range (32 bit) // wavewise range (32 bit)
dst_wave_config.range[2] = dst_data_range * sizeof(ushort); dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(ushort);
// wavewise setting (32 bit) // wavewise setting (32 bit)
dst_wave_config.range[3] = 0x00027000; dst_wave_buffer_resource.config[3] = 0x00027000;
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(ushort); index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(ushort);
index_t dst_const_addr_offset = dst_const_data_offset * sizeof(ushort);
const float2_t* p_src_tmp = reinterpret_cast<const float2_t*>(p_src_thread); const float2_t* p_src_tmp = reinterpret_cast<const float2_t*>(p_src_thread);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK #if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_store_f32x2( __llvm_amdgcn_buffer_store_f32x2(*p_src_tmp,
*p_src_tmp, dst_wave_buffer_resource.data,
dst_wave_config.data, 0,
0, dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
dst_data_valid ? (dst_thread_addr_offset + dst_const_addr_offset) : 0xffffffff, false,
false, false);
false);
#else #else
index_t dst_addr_base = dst_data_valid ? 0 : 0x7fffffff; index_t dst_addr_base = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32x2(*p_src_tmp, __llvm_amdgcn_buffer_store_f32x2(*p_src_tmp,
dst_wave_config.data, dst_wave_buffer_resource.data,
0, 0,
dst_addr_base + dst_thread_addr_offset + dst_const_addr_offset, dst_addr_base + dst_thread_addr_offset,
false, false,
false); false);
#endif #endif
...@@ -1050,39 +954,36 @@ template <> ...@@ -1050,39 +954,36 @@ template <>
__device__ void amd_buffer_store<ushort, 8>(const ushort* p_src_thread, __device__ void amd_buffer_store<ushort, 8>(const ushort* p_src_thread,
ushort* p_dst_wave, ushort* p_dst_wave,
index_t dst_thread_data_offset, index_t dst_thread_data_offset,
index_t dst_const_data_offset, bool dst_thread_data_valid,
bool dst_data_valid,
index_t dst_data_range) index_t dst_data_range)
{ {
BufferResourceConstant<ushort> dst_wave_config; BufferResourceConstant<ushort> dst_wave_buffer_resource;
// wavewise base address (64 bit) // wavewise base address (64 bit)
dst_wave_config.address[0] = p_dst_wave; dst_wave_buffer_resource.address[0] = p_dst_wave;
// wavewise range (32 bit) // wavewise range (32 bit)
dst_wave_config.range[2] = dst_data_range * sizeof(ushort); dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(ushort);
// wavewise setting (32 bit) // wavewise setting (32 bit)
dst_wave_config.range[3] = 0x00027000; dst_wave_buffer_resource.config[3] = 0x00027000;
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(ushort); index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(ushort);
index_t dst_const_addr_offset = dst_const_data_offset * sizeof(ushort);
const float4_t* p_src_tmp = reinterpret_cast<const float4_t*>(p_src_thread); const float4_t* p_src_tmp = reinterpret_cast<const float4_t*>(p_src_thread);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK #if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_store_f32x4( __llvm_amdgcn_buffer_store_f32x4(*p_src_tmp,
*p_src_tmp, dst_wave_buffer_resource.data,
dst_wave_config.data, 0,
0, dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
dst_data_valid ? (dst_thread_addr_offset + dst_const_addr_offset) : 0xffffffff, false,
false, false);
false);
#else #else
index_t dst_addr_base = dst_data_valid ? 0 : 0x7fffffff; index_t dst_addr_base = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32x4(*p_src_tmp, __llvm_amdgcn_buffer_store_f32x4(*p_src_tmp,
dst_wave_config.data, dst_wave_buffer_resource.data,
0, 0,
dst_addr_base + dst_thread_addr_offset + dst_const_addr_offset, dst_addr_base + dst_thread_addr_offset,
false, false,
false); false);
#endif #endif
...@@ -1092,37 +993,33 @@ template <> ...@@ -1092,37 +993,33 @@ template <>
__device__ void amd_buffer_atomic_add<float, 1>(const float* p_src_thread, __device__ void amd_buffer_atomic_add<float, 1>(const float* p_src_thread,
float* p_dst_wave, float* p_dst_wave,
index_t dst_thread_data_offset, index_t dst_thread_data_offset,
index_t dst_const_data_offset, bool dst_thread_data_valid,
bool dst_data_valid,
index_t dst_data_range) index_t dst_data_range)
{ {
BufferResourceConstant<float> dst_wave_config; BufferResourceConstant<float> dst_wave_buffer_resource;
// wavewise base address (64 bit) // wavewise base address (64 bit)
dst_wave_config.address[0] = p_dst_wave; dst_wave_buffer_resource.address[0] = p_dst_wave;
// wavewise range (32 bit) // wavewise range (32 bit)
dst_wave_config.range[2] = dst_data_range * sizeof(float); dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
// wavewise setting (32 bit) // wavewise setting (32 bit)
dst_wave_config.range[3] = 0x00027000; dst_wave_buffer_resource.config[3] = 0x00027000;
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK #if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_atomic_add_f32( __llvm_amdgcn_buffer_atomic_add_f32(*p_src_thread,
*p_src_thread, dst_wave_buffer_resource.data,
dst_wave_config.data, 0,
0, dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
dst_data_valid ? (dst_thread_addr_offset + dst_const_addr_offset) : 0xffffffff, false);
false);
#else #else
index_t dst_addr_base = dst_data_valid ? 0 : 0x7fffffff; index_t dst_addr_base = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_atomic_add_f32(*p_src_thread, __llvm_amdgcn_buffer_atomic_add_f32(*p_src_thread,
dst_wave_config.data, dst_wave_buffer_resource.data,
0, 0,
dst_addr_base + dst_thread_addr_offset + dst_addr_base + dst_thread_addr_offset,
dst_const_addr_offset,
false); false);
#endif #endif
} }
...@@ -1131,43 +1028,40 @@ template <> ...@@ -1131,43 +1028,40 @@ template <>
__device__ void amd_buffer_atomic_add<float, 2>(const float* p_src_thread, __device__ void amd_buffer_atomic_add<float, 2>(const float* p_src_thread,
float* p_dst_wave, float* p_dst_wave,
index_t dst_thread_data_offset, index_t dst_thread_data_offset,
index_t dst_const_data_offset, bool dst_thread_data_valid,
bool dst_data_valid,
index_t dst_data_range) index_t dst_data_range)
{ {
BufferResourceConstant<float> dst_wave_config; BufferResourceConstant<float> dst_wave_buffer_resource;
// wavewise base address (64 bit) // wavewise base address (64 bit)
dst_wave_config.address[0] = p_dst_wave; dst_wave_buffer_resource.address[0] = p_dst_wave;
// wavewise range (32 bit) // wavewise range (32 bit)
dst_wave_config.range[2] = dst_data_range; dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
// wavewise setting (32 bit) // wavewise setting (32 bit)
dst_wave_config.range[3] = 0x00027000; dst_wave_buffer_resource.config[3] = 0x00027000;
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK #if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
for(index_t i = 0; i < 2; ++i) for(index_t i = 0; i < 2; ++i)
{ {
__llvm_amdgcn_buffer_atomic_add_f32( __llvm_amdgcn_buffer_atomic_add_f32(
p_src_thread[i], p_src_thread[i],
dst_wave_config.data, dst_wave_buffer_resource.data,
0, 0,
dst_data_valid ? (dst_thread_addr_offset + dst_const_addr_offset + i * sizeof(float)) dst_thread_data_valid ? (dst_thread_addr_offset + i * sizeof(float)) : 0xffffffff,
: 0xffffffff,
false); false);
} }
#else #else
index_t dst_addr_base = dst_data_valid ? 0 : 0x7fffffff; index_t dst_addr_base = dst_thread_data_valid ? 0 : 0x7fffffff;
for(index_t i = 0; i < 2; ++i) for(index_t i = 0; i < 2; ++i)
{ {
__llvm_amdgcn_buffer_atomic_add_f32(p_src_thread[i], __llvm_amdgcn_buffer_atomic_add_f32(p_src_thread[i],
dst_wave_config.data, dst_wave_buffer_resource.data,
0, 0,
dst_addr_base + dst_thread_addr_offset + dst_addr_base + dst_thread_addr_offset +
dst_const_addr_offset + i * sizeof(float), i * sizeof(float),
false); false);
} }
#endif #endif
...@@ -1177,43 +1071,40 @@ template <> ...@@ -1177,43 +1071,40 @@ template <>
__device__ void amd_buffer_atomic_add<float, 4>(const float* p_src_thread, __device__ void amd_buffer_atomic_add<float, 4>(const float* p_src_thread,
float* p_dst_wave, float* p_dst_wave,
index_t dst_thread_data_offset, index_t dst_thread_data_offset,
index_t dst_const_data_offset, bool dst_thread_data_valid,
bool dst_data_valid,
index_t dst_data_range) index_t dst_data_range)
{ {
BufferResourceConstant<float> dst_wave_config; BufferResourceConstant<float> dst_wave_buffer_resource;
// wavewise base address (64 bit) // wavewise base address (64 bit)
dst_wave_config.address[0] = p_dst_wave; dst_wave_buffer_resource.address[0] = p_dst_wave;
// wavewise range (32 bit) // wavewise range (32 bit)
dst_wave_config.range[2] = dst_data_range * sizeof(float); dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
// wavewise setting (32 bit) // wavewise setting (32 bit)
dst_wave_config.range[3] = 0x00027000; dst_wave_buffer_resource.config[3] = 0x00027000;
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK #if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
for(index_t i = 0; i < 4; ++i) for(index_t i = 0; i < 4; ++i)
{ {
__llvm_amdgcn_buffer_atomic_add_f32( __llvm_amdgcn_buffer_atomic_add_f32(
p_src_thread[i], p_src_thread[i],
dst_wave_config.data, dst_wave_buffer_resource.data,
0, 0,
dst_data_valid ? (dst_thread_addr_offset + dst_const_addr_offset + i * sizeof(float)) dst_thread_data_valid ? (dst_thread_addr_offset + i * sizeof(float)) : 0xffffffff,
: 0xffffffff,
false); false);
} }
#else #else
index_t dst_addr_base = dst_data_valid ? 0 : 0x7fffffff; index_t dst_addr_base = dst_thread_data_valid ? 0 : 0x7fffffff;
for(index_t i = 0; i < 4; ++i) for(index_t i = 0; i < 4; ++i)
{ {
__llvm_amdgcn_buffer_atomic_add_f32(p_src_thread[i], __llvm_amdgcn_buffer_atomic_add_f32(p_src_thread[i],
dst_wave_config.data, dst_wave_buffer_resource.data,
0, 0,
dst_addr_base + dst_thread_addr_offset + dst_addr_base + dst_thread_addr_offset +
dst_const_addr_offset + i * sizeof(float), i * sizeof(float),
false); false);
} }
#endif #endif
......
...@@ -72,8 +72,8 @@ struct SetData ...@@ -72,8 +72,8 @@ struct SetData
#if CK_USE_AMD_BUFFER_ADDRESSING #if CK_USE_AMD_BUFFER_ADDRESSING
// buffer_load requires: // buffer_load requires:
// 1) p_src must be in global memory space, d_dst must be vgpr // 1) p_src must be in global memory space, p_dst must be in vgpr
// 2) p_src to be a block-invariant pointer. // 2) p_src must be a wavewise pointer.
// It is user's responsibility to make sure that is true. // It is user's responsibility to make sure that is true.
template <> template <>
__device__ void Run<AddressSpace::Global, AddressSpace::Vgpr>(const T* p_src, __device__ void Run<AddressSpace::Global, AddressSpace::Vgpr>(const T* p_src,
...@@ -88,13 +88,13 @@ struct SetData ...@@ -88,13 +88,13 @@ struct SetData
if(dst_valid) if(dst_valid)
{ {
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) = *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
amd_buffer_load<T, DataPerAccess>(p_src, src_offset, 0, src_valid, src_range); amd_buffer_load<T, DataPerAccess>(p_src, src_offset, src_valid, src_range);
} }
} }
// buffer_store requires: // buffer_store requires:
// 1) p_src must be in vgpr space, d_dst must be global memory // 1) p_src must be in vgpr space, p_dst must be in global memory
// 2) p_dst to be a block-invariant pointer. // 2) p_dst must be a wavewise pointer.
// It is user's responsibility to make sure that is true. // It is user's responsibility to make sure that is true.
template <> template <>
__device__ void Run<AddressSpace::Vgpr, AddressSpace::Global>(const T* p_src, __device__ void Run<AddressSpace::Vgpr, AddressSpace::Global>(const T* p_src,
...@@ -109,7 +109,7 @@ struct SetData ...@@ -109,7 +109,7 @@ struct SetData
const auto zeros = vector_t(0); const auto zeros = vector_t(0);
amd_buffer_store<T, DataPerAccess>( amd_buffer_store<T, DataPerAccess>(
src_valid ? &(p_src[src_offset]) : &zeros, p_dst, dst_offset, 0, dst_valid, dst_range); src_valid ? &(p_src[src_offset]) : &zeros, p_dst, dst_offset, dst_valid, dst_range);
} }
#endif #endif
}; };
...@@ -138,9 +138,9 @@ struct AtomicAddData ...@@ -138,9 +138,9 @@ struct AtomicAddData
} }
#if CK_USE_AMD_BUFFER_ADDRESSING && CK_USE_AMD_BUFFER_ATOMIC_ADD #if CK_USE_AMD_BUFFER_ADDRESSING && CK_USE_AMD_BUFFER_ATOMIC_ADD
// buffer_atomic_add requires: // buffer_atomic requires:
// 1) p_src must be in vgpr space, d_dst must be global memory // 1) p_src must be in vgpr space, p_dst must be in global memory
// 2) p_dst to be a block-invariant pointer. // 2) p_dst must be a wavewise pointer.
// It is user's responsibility to make sure that is true. // It is user's responsibility to make sure that is true.
template <> template <>
__device__ void Run<AddressSpace::Vgpr, AddressSpace::Global>(const T* p_src, __device__ void Run<AddressSpace::Vgpr, AddressSpace::Global>(const T* p_src,
...@@ -156,7 +156,6 @@ struct AtomicAddData ...@@ -156,7 +156,6 @@ struct AtomicAddData
amd_buffer_atomic_add<T, DataPerAccess>(src_valid ? &(p_src[src_offset]) : &zeros, amd_buffer_atomic_add<T, DataPerAccess>(src_valid ? &(p_src[src_offset]) : &zeros,
p_dst, p_dst,
dst_offset, dst_offset,
0,
dst_valid, dst_valid,
dst_range);
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment