Commit 7a3d9697 authored by Chao Liu

buffer addressing use offset trick

parent 3b07df08
@@ -5,118 +5,118 @@
namespace ck {

// For 128 bit SGPRs to supply resource constant in buffer instructions
// https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions
template <typename T>
union BufferResourceConstant
{
    int32x4_t data;
    T* address[2];
    int32_t range[4];
};
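
// The four 32 bit words of the resource constant are laid out as:
//   word 0-1: base address of the wavewise buffer (64 bit)
//   word 2:   range of the buffer, in bytes (32 bit)
//   word 3:   hardware setting (32 bit), 0x00027000 throughout this file
//
// A minimal sketch of how the union is filled (hypothetical helper, names for
// exposition only):
//
//   __device__ int32x4_t make_buffer_resource(float* p_wave, index_t data_range)
//   {
//       BufferResourceConstant<float> config;
//       config.address[0] = p_wave;                     // wavewise base address
//       config.range[2]   = data_range * sizeof(float); // range in bytes
//       config.range[3]   = 0x00027000;                 // wavewise setting
//       return config.data;
//   }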
__device__ float __llvm_amdgcn_buffer_load_f32(int32x4_t srsrc,
                                               index_t vindex,
                                               index_t offset,
                                               bool glc,
                                               bool slc) __asm("llvm.amdgcn.buffer.load.f32");

__device__ float2_t
__llvm_amdgcn_buffer_load_f32x2(int32x4_t srsrc,
                                index_t vindex,
                                index_t offset,
                                bool glc,
                                bool slc) __asm("llvm.amdgcn.buffer.load.v2f32");

__device__ float4_t
__llvm_amdgcn_buffer_load_f32x4(int32x4_t srsrc,
                                index_t vindex,
                                index_t offset,
                                bool glc,
                                bool slc) __asm("llvm.amdgcn.buffer.load.v4f32");

__device__ half_t __llvm_amdgcn_buffer_load_f16(int32x4_t srsrc,
                                                index_t vindex,
                                                index_t offset,
                                                bool glc,
                                                bool slc) __asm("llvm.amdgcn.buffer.load.f16");

__device__ half2_t __llvm_amdgcn_buffer_load_f16x2(int32x4_t srsrc,
                                                   index_t vindex,
                                                   index_t offset,
                                                   bool glc,
                                                   bool slc) __asm("llvm.amdgcn.buffer.load.v2f16");

__device__ half4_t __llvm_amdgcn_buffer_load_f16x4(int32x4_t srsrc,
                                                   index_t vindex,
                                                   index_t offset,
                                                   bool glc,
                                                   bool slc) __asm("llvm.amdgcn.buffer.load.v4f16");

__device__ ushort __llvm_amdgcn_buffer_load_bf16(int32x4_t srsrc,
                                                 index_t vindex,
                                                 index_t offset,
                                                 bool glc,
                                                 bool slc) __asm("llvm.amdgcn.buffer.load.bf16");

__device__ ushort2_t
__llvm_amdgcn_buffer_load_bf16x2(int32x4_t srsrc,
                                 index_t vindex,
                                 index_t offset,
                                 bool glc,
                                 bool slc) __asm("llvm.amdgcn.buffer.load.v2bf16");

__device__ ushort4_t
__llvm_amdgcn_buffer_load_bf16x4(int32x4_t srsrc,
                                 index_t vindex,
                                 index_t offset,
                                 bool glc,
                                 bool slc) __asm("llvm.amdgcn.buffer.load.v4bf16");

__device__ void __llvm_amdgcn_buffer_store_f32(float vdata,
                                               int32x4_t srsrc,
                                               index_t vindex,
                                               index_t offset,
                                               bool glc,
                                               bool slc) __asm("llvm.amdgcn.buffer.store.f32");

__device__ void __llvm_amdgcn_buffer_store_f32x2(float2_t vdata,
                                                 int32x4_t srsrc,
                                                 index_t vindex,
                                                 index_t offset,
                                                 bool glc,
                                                 bool slc) __asm("llvm.amdgcn.buffer.store.v2f32");

__device__ void __llvm_amdgcn_buffer_store_f32x4(float4_t vdata,
                                                 int32x4_t srsrc,
                                                 index_t vindex,
                                                 index_t offset,
                                                 bool glc,
                                                 bool slc) __asm("llvm.amdgcn.buffer.store.v4f32");

__device__ void __llvm_amdgcn_buffer_store_f16(half_t vdata,
                                               int32x4_t srsrc,
                                               index_t vindex,
                                               index_t offset,
                                               bool glc,
                                               bool slc) __asm("llvm.amdgcn.buffer.store.f16");

__device__ void __llvm_amdgcn_buffer_store_f16x2(half2_t vdata,
                                                 int32x4_t srsrc,
                                                 index_t vindex,
                                                 index_t offset,
                                                 bool glc,
                                                 bool slc) __asm("llvm.amdgcn.buffer.store.v2f16");

__device__ void __llvm_amdgcn_buffer_store_f16x4(half4_t vdata,
                                                 int32x4_t srsrc,
                                                 index_t vindex,
                                                 index_t offset,
                                                 bool glc,
                                                 bool slc) __asm("llvm.amdgcn.buffer.store.v4f16");

__device__ void __llvm_amdgcn_buffer_store_bf16(ushort vdata,
                                                int32x4_t srsrc,
                                                index_t vindex,
                                                index_t offset,
                                                bool glc,
@@ -124,7 +124,7 @@ __device__ void __llvm_amdgcn_buffer_store_bf16(ushort vdata,

__device__ void
__llvm_amdgcn_buffer_store_bf16x2(ushort2_t vdata,
                                  int32x4_t srsrc,
                                  index_t vindex,
                                  index_t offset,
                                  bool glc,
@@ -132,7 +132,7 @@ __llvm_amdgcn_buffer_store_bf16x2(ushort2_t vdata,

__device__ void
__llvm_amdgcn_buffer_store_bf16x4(ushort4_t vdata,
                                  int32x4_t srsrc,
                                  index_t vindex,
                                  index_t offset,
                                  bool glc,
@@ -140,781 +140,1083 @@ __llvm_amdgcn_buffer_store_bf16x4(ushort4_t vdata,

__device__ void
__llvm_amdgcn_buffer_atomic_add_f32(float vdata,
                                    int32x4_t srsrc,
                                    index_t vindex,
                                    index_t offset,
                                    bool slc) __asm("llvm.amdgcn.buffer.atomic.fadd.f32");

// buffer_load requires:
// 1) p_src_wave must be in global memory space, the returned value is in vgpr
// 2) p_src_wave to be a wavewise pointer.
// It is user's responsibility to make sure that is true.
template <typename T, index_t VectorSize>
__device__ typename vector_type<T, VectorSize>::MemoryType
amd_buffer_load(const T* p_src_wave,
                index_t src_thread_data_offset,
                index_t src_const_data_offset,
                bool src_data_valid,
                index_t src_data_range);
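
// Illustrative use of amd_buffer_load (p_global, thread_offset, is_in_bound
// and num_elements are hypothetical names for exposition, not symbols defined
// in this file):
//
//   float4_t v = amd_buffer_load<float, 4>(p_global,      // wavewise base pointer
//                                          thread_offset, // per-thread element offset
//                                          0,             // constant element offset
//                                          is_in_bound,   // out-of-bound predicate
//                                          num_elements); // buffer range, in elements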

// buffer_store requires:
// 1) p_src_thread must be in vgpr space, p_dst_wave must be in global memory
// 2) p_dst_wave to be a wavewise pointer.
// It is user's responsibility to make sure that is true.
template <typename T, index_t VectorSize>
__device__ void amd_buffer_store(const T* p_src_thread,
                                 T* p_dst_wave,
                                 index_t dst_thread_data_offset,
                                 index_t dst_const_data_offset,
                                 bool dst_data_valid,
                                 index_t dst_data_range);

// buffer_atomic requires:
// 1) p_src_thread must be in vgpr space, p_dst_wave must be in global memory
// 2) p_dst_wave to be a wavewise pointer.
// It is user's responsibility to make sure that is true.
template <typename T, index_t VectorSize>
__device__ void amd_buffer_atomic_add(const T* p_src_thread,
                                      T* p_dst_wave,
                                      index_t dst_thread_data_offset,
                                      index_t dst_const_data_offset,
                                      bool dst_data_valid,
                                      index_t dst_data_range);
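
// Out-of-bound handling in the specializations below leans on the hardware
// range check of buffer instructions: an address past the byte range encoded
// in the resource constant returns 0 on loads and is discarded on stores and
// atomics. Two encodings of "invalid" are used:
//
//   - without CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK, the
//     per-access offset itself is selected:
//         offset = valid ? thread_offset + const_offset : 0xffffffff;
//   - with the offset trick, a base term is selected once and always added:
//         base   = valid ? 0 : 0x7fffffff;
//         offset = base + thread_offset + const_offset;
//     which presumably lets the compiler keep the offset arithmetic
//     branch-free and hoist the validity select out of the addressing path.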

template <>
__device__ float amd_buffer_load<float, 1>(const float* p_src_wave,
                                           index_t src_thread_data_offset,
                                           index_t src_const_data_offset,
                                           bool src_data_valid,
                                           index_t src_data_range)
{
    BufferResourceConstant<float> src_wave_config;

    // wavewise base address (64 bit)
    src_wave_config.address[0] = const_cast<float*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_config.range[2] = src_data_range * sizeof(float);
    // wavewise setting (32 bit)
    src_wave_config.range[3] = 0x00027000;

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
    index_t src_const_addr_offset  = src_const_data_offset * sizeof(float);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    return __llvm_amdgcn_buffer_load_f32(
        src_wave_config.data,
        0,
        src_data_valid ? (src_thread_addr_offset + src_const_addr_offset) : 0xffffffff,
        false,
        false);
#else
    index_t src_addr_base = src_data_valid ? 0 : 0x7fffffff;

    return __llvm_amdgcn_buffer_load_f32(src_wave_config.data,
                                         0,
                                         src_addr_base + src_thread_addr_offset +
                                             src_const_addr_offset,
                                         false,
                                         false);
#endif
}

template <>
__device__ float2_t amd_buffer_load<float, 2>(const float* p_src_wave,
                                              index_t src_thread_data_offset,
                                              index_t src_const_data_offset,
                                              bool src_data_valid,
                                              index_t src_data_range)
{
    BufferResourceConstant<float> src_wave_config;

    // wavewise base address (64 bit)
    src_wave_config.address[0] = const_cast<float*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_config.range[2] = src_data_range * sizeof(float);
    // wavewise setting (32 bit)
    src_wave_config.range[3] = 0x00027000;

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
    index_t src_const_addr_offset  = src_const_data_offset * sizeof(float);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    return __llvm_amdgcn_buffer_load_f32x2(
        src_wave_config.data,
        0,
        src_data_valid ? (src_thread_addr_offset + src_const_addr_offset) : 0xffffffff,
        false,
        false);
#else
    index_t src_addr_base = src_data_valid ? 0 : 0x7fffffff;

    return __llvm_amdgcn_buffer_load_f32x2(src_wave_config.data,
                                           0,
                                           src_addr_base + src_thread_addr_offset +
                                               src_const_addr_offset,
                                           false,
                                           false);
#endif
}

template <>
__device__ float4_t amd_buffer_load<float, 4>(const float* p_src_wave,
                                              index_t src_thread_data_offset,
                                              index_t src_const_data_offset,
                                              bool src_data_valid,
                                              index_t src_data_range)
{
    BufferResourceConstant<float> src_wave_config;

    // wavewise base address (64 bit)
    src_wave_config.address[0] = const_cast<float*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_config.range[2] = src_data_range * sizeof(float);
    // wavewise setting (32 bit)
    src_wave_config.range[3] = 0x00027000;

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
    index_t src_const_addr_offset  = src_const_data_offset * sizeof(float);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    return __llvm_amdgcn_buffer_load_f32x4(
        src_wave_config.data,
        0,
        src_data_valid ? (src_thread_addr_offset + src_const_addr_offset) : 0xffffffff,
        false,
        false);
#else
    index_t src_addr_base = src_data_valid ? 0 : 0x7fffffff;

    return __llvm_amdgcn_buffer_load_f32x4(src_wave_config.data,
                                           0,
                                           src_addr_base + src_thread_addr_offset +
                                               src_const_addr_offset,
                                           false,
                                           false);
#endif
}

template <>
__device__ half_t amd_buffer_load<half_t, 1>(const half_t* p_src_wave,
                                             index_t src_thread_data_offset,
                                             index_t src_const_data_offset,
                                             bool src_data_valid,
                                             index_t src_data_range)
{
    BufferResourceConstant<half_t> src_wave_config;

    // wavewise base address (64 bit)
    src_wave_config.address[0] = const_cast<half_t*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_config.range[2] = src_data_range * sizeof(half_t);
    // wavewise setting (32 bit)
    src_wave_config.range[3] = 0x00027000;

#if !CK_WORKAROUND_SWDEV_231101
    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);
    index_t src_const_addr_offset  = src_const_data_offset * sizeof(half_t);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    return __llvm_amdgcn_buffer_load_f16(
        src_wave_config.data,
        0,
        src_data_valid ? (src_thread_addr_offset + src_const_addr_offset) : 0xffffffff,
        false,
        false);
#else
    index_t src_addr_base = src_data_valid ? 0 : 0x7fffffff;

    return __llvm_amdgcn_buffer_load_f16(src_wave_config.data,
                                         0,
                                         src_addr_base + src_thread_addr_offset +
                                             src_const_addr_offset,
                                         false,
                                         false);
#endif
#else
    return src_data_valid ? p_src_wave[src_thread_data_offset + src_const_data_offset] : 0;
#endif
}

template <>
__device__ half2_t amd_buffer_load<half_t, 2>(const half_t* p_src_wave,
                                              index_t src_thread_data_offset,
                                              index_t src_const_data_offset,
                                              bool src_data_valid,
                                              index_t src_data_range)
{
    BufferResourceConstant<half_t> src_wave_config;

    // wavewise base address (64 bit)
    src_wave_config.address[0] = const_cast<half_t*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_config.range[2] = src_data_range * sizeof(half_t);
    // wavewise setting (32 bit)
    src_wave_config.range[3] = 0x00027000;

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);
    index_t src_const_addr_offset  = src_const_data_offset * sizeof(half_t);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    float dst_out_tmp = __llvm_amdgcn_buffer_load_f32(
        src_wave_config.data,
        0,
        src_data_valid ? (src_thread_addr_offset + src_const_addr_offset) : 0xffffffff,
        false,
        false);
#else
    index_t src_addr_base = src_data_valid ? 0 : 0x7fffffff;

    float dst_out_tmp = __llvm_amdgcn_buffer_load_f32(src_wave_config.data,
                                                      0,
                                                      src_addr_base + src_thread_addr_offset +
                                                          src_const_addr_offset,
                                                      false,
                                                      false);
#endif

    return *reinterpret_cast<half2_t*>(&dst_out_tmp);
}

template <>
__device__ half4_t amd_buffer_load<half_t, 4>(const half_t* p_src_wave,
                                              index_t src_thread_data_offset,
                                              index_t src_const_data_offset,
                                              bool src_data_valid,
                                              index_t src_data_range)
{
    BufferResourceConstant<half_t> src_wave_config;

    // wavewise base address (64 bit)
    src_wave_config.address[0] = const_cast<half_t*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_config.range[2] = src_data_range * sizeof(half_t);
    // wavewise setting (32 bit)
    src_wave_config.range[3] = 0x00027000;

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);
    index_t src_const_addr_offset  = src_const_data_offset * sizeof(half_t);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    float2_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x2(
        src_wave_config.data,
        0,
        src_data_valid ? (src_thread_addr_offset + src_const_addr_offset) : 0xffffffff,
        false,
        false);
#else
    index_t src_addr_base = src_data_valid ? 0 : 0x7fffffff;

    float2_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x2(src_wave_config.data,
                                                           0,
                                                           src_addr_base + src_thread_addr_offset +
                                                               src_const_addr_offset,
                                                           false,
                                                           false);
#endif

    return *reinterpret_cast<half4_t*>(&dst_out_tmp);
}

template <>
__device__ half8_t amd_buffer_load<half_t, 8>(const half_t* p_src_wave,
                                              index_t src_thread_data_offset,
                                              index_t src_const_data_offset,
                                              bool src_data_valid,
                                              index_t src_data_range)
{
    BufferResourceConstant<half_t> src_wave_config;

    // wavewise base address (64 bit)
    src_wave_config.address[0] = const_cast<half_t*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_config.range[2] = src_data_range * sizeof(half_t);
    // wavewise setting (32 bit)
    src_wave_config.range[3] = 0x00027000;

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);
    index_t src_const_addr_offset  = src_const_data_offset * sizeof(half_t);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    float4_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x4(
        src_wave_config.data,
        0,
        src_data_valid ? (src_thread_addr_offset + src_const_addr_offset) : 0xffffffff,
        false,
        false);
#else
    index_t src_addr_base = src_data_valid ? 0 : 0x7fffffff;

    float4_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x4(src_wave_config.data,
                                                           0,
                                                           src_addr_base + src_thread_addr_offset +
                                                               src_const_addr_offset,
                                                           false,
                                                           false);
#endif

    return *reinterpret_cast<half8_t*>(&dst_out_tmp);
}

template <>
__device__ ushort amd_buffer_load<ushort, 1>(const ushort* p_src_wave,
                                             index_t src_thread_data_offset,
                                             index_t src_const_data_offset,
                                             bool src_data_valid,
                                             index_t src_data_range)
{
    BufferResourceConstant<ushort> src_wave_config;

    // wavewise base address (64 bit)
    src_wave_config.address[0] = const_cast<ushort*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_config.range[2] = src_data_range * sizeof(ushort);
    // wavewise setting (32 bit)
    src_wave_config.range[3] = 0x00027000;

#if !CK_WORKAROUND_SWDEV_231101
    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);
    index_t src_const_addr_offset  = src_const_data_offset * sizeof(ushort);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    return __llvm_amdgcn_buffer_load_bf16(
        src_wave_config.data,
        0,
        src_data_valid ? (src_thread_addr_offset + src_const_addr_offset) : 0xffffffff,
        false,
        false);
#else
    index_t src_addr_base = src_data_valid ? 0 : 0x7fffffff;

    return __llvm_amdgcn_buffer_load_bf16(src_wave_config.data,
                                          0,
                                          src_addr_base + src_thread_addr_offset +
                                              src_const_addr_offset,
                                          false,
                                          false);
#endif
#else
    return src_data_valid ? p_src_wave[src_thread_data_offset + src_const_data_offset] : 0;
#endif
}

template <>
__device__ ushort2_t amd_buffer_load<ushort, 2>(const ushort* p_src_wave,
                                                index_t src_thread_data_offset,
                                                index_t src_const_data_offset,
                                                bool src_data_valid,
                                                index_t src_data_range)
{
    BufferResourceConstant<ushort> src_wave_config;

    // wavewise base address (64 bit)
    src_wave_config.address[0] = const_cast<ushort*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_config.range[2] = src_data_range * sizeof(ushort);
    // wavewise setting (32 bit)
    src_wave_config.range[3] = 0x00027000;

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);
    index_t src_const_addr_offset  = src_const_data_offset * sizeof(ushort);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    float dst_out_tmp = __llvm_amdgcn_buffer_load_f32(
        src_wave_config.data,
        0,
        src_data_valid ? (src_thread_addr_offset + src_const_addr_offset) : 0xffffffff,
        false,
        false);
#else
    index_t src_addr_base = src_data_valid ? 0 : 0x7fffffff;

    float dst_out_tmp = __llvm_amdgcn_buffer_load_f32(src_wave_config.data,
                                                      0,
                                                      src_addr_base + src_thread_addr_offset +
                                                          src_const_addr_offset,
                                                      false,
                                                      false);
#endif

    return *reinterpret_cast<ushort2_t*>(&dst_out_tmp);
}

template <>
__device__ ushort4_t amd_buffer_load<ushort, 4>(const ushort* p_src_wave,
                                                index_t src_thread_data_offset,
                                                index_t src_const_data_offset,
                                                bool src_data_valid,
                                                index_t src_data_range)
{
    BufferResourceConstant<ushort> src_wave_config;

    // wavewise base address (64 bit)
    src_wave_config.address[0] = const_cast<ushort*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_config.range[2] = src_data_range * sizeof(ushort);
    // wavewise setting (32 bit)
    src_wave_config.range[3] = 0x00027000;

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);
    index_t src_const_addr_offset  = src_const_data_offset * sizeof(ushort);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    float2_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x2(
        src_wave_config.data,
        0,
        src_data_valid ? (src_thread_addr_offset + src_const_addr_offset) : 0xffffffff,
        false,
        false);
#else
    index_t src_addr_base = src_data_valid ? 0 : 0x7fffffff;

    float2_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x2(src_wave_config.data,
                                                           0,
                                                           src_addr_base + src_thread_addr_offset +
                                                               src_const_addr_offset,
                                                           false,
                                                           false);
#endif

    return *reinterpret_cast<ushort4_t*>(&dst_out_tmp);
}

template <>
__device__ ushort8_t amd_buffer_load<ushort, 8>(const ushort* p_src_wave,
                                                index_t src_thread_data_offset,
                                                index_t src_const_data_offset,
                                                bool src_data_valid,
                                                index_t src_data_range)
{
    BufferResourceConstant<ushort> src_wave_config;

    // wavewise base address (64 bit)
    src_wave_config.address[0] = const_cast<ushort*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_config.range[2] = src_data_range * sizeof(ushort);
    // wavewise setting (32 bit)
    src_wave_config.range[3] = 0x00027000;

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);
    index_t src_const_addr_offset  = src_const_data_offset * sizeof(ushort);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    float4_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x4(
        src_wave_config.data,
        0,
        src_data_valid ? (src_thread_addr_offset + src_const_addr_offset) : 0xffffffff,
        false,
        false);
#else
    index_t src_addr_base = src_data_valid ? 0 : 0x7fffffff;

    float4_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x4(src_wave_config.data,
                                                           0,
                                                           src_addr_base + src_thread_addr_offset +
                                                               src_const_addr_offset,
                                                           false,
                                                           false);
#endif

    return *reinterpret_cast<ushort8_t*>(&dst_out_tmp);
}

template <>
__device__ void amd_buffer_store<float, 1>(const float* p_src_thread,
                                           float* p_dst_wave,
                                           index_t dst_thread_data_offset,
                                           index_t dst_const_data_offset,
                                           bool dst_data_valid,
                                           index_t dst_data_range)
{
    BufferResourceConstant<float> dst_wave_config;

    // wavewise base address (64 bit)
    dst_wave_config.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_config.range[2] = dst_data_range * sizeof(float);
    // wavewise setting (32 bit)
    dst_wave_config.range[3] = 0x00027000;

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
    index_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    __llvm_amdgcn_buffer_store_f32(*p_src_thread,
                                   dst_wave_config.data,
                                   0,
                                   dst_data_valid ? (dst_thread_addr_offset + dst_const_addr_offset)
                                                  : 0xffffffff,
                                   false,
                                   false);
#else
    index_t dst_addr_base = dst_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_store_f32(*p_src_thread,
                                   dst_wave_config.data,
                                   0,
                                   dst_addr_base + dst_thread_addr_offset + dst_const_addr_offset,
                                   false,
                                   false);
#endif
}

template <>
__device__ void amd_buffer_store<float, 2>(const float* p_src_thread,
                                           float* p_dst_wave,
                                           index_t dst_thread_data_offset,
                                           index_t dst_const_data_offset,
                                           bool dst_data_valid,
                                           index_t dst_data_range)
{
    BufferResourceConstant<float> dst_wave_config;

    // wavewise base address (64 bit)
    dst_wave_config.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_config.range[2] = dst_data_range * sizeof(float);
    // wavewise setting (32 bit)
    dst_wave_config.range[3] = 0x00027000;

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
    index_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    __llvm_amdgcn_buffer_store_f32x2(
        *reinterpret_cast<const float2_t*>(p_src_thread),
        dst_wave_config.data,
        0,
        dst_data_valid ? (dst_thread_addr_offset + dst_const_addr_offset) : 0xffffffff,
        false,
        false);
#else
    index_t dst_addr_base = dst_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_store_f32x2(*reinterpret_cast<const float2_t*>(p_src_thread),
                                     dst_wave_config.data,
                                     0,
                                     dst_addr_base + dst_thread_addr_offset + dst_const_addr_offset,
                                     false,
                                     false);
#endif
}

template <>
__device__ void amd_buffer_store<float, 4>(const float* p_src_thread,
                                           float* p_dst_wave,
                                           index_t dst_thread_data_offset,
                                           index_t dst_const_data_offset,
                                           bool dst_data_valid,
                                           index_t dst_data_range)
{
    BufferResourceConstant<float> dst_wave_config;

    // wavewise base address (64 bit)
    dst_wave_config.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_config.range[2] = dst_data_range * sizeof(float);
    // wavewise setting (32 bit)
    dst_wave_config.range[3] = 0x00027000;

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
    index_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    __llvm_amdgcn_buffer_store_f32x4(
        *reinterpret_cast<const float4_t*>(p_src_thread),
        dst_wave_config.data,
        0,
        dst_data_valid ? (dst_thread_addr_offset + dst_const_addr_offset) : 0xffffffff,
        false,
        false);
#else
    index_t dst_addr_base = dst_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_store_f32x4(*reinterpret_cast<const float4_t*>(p_src_thread),
                                     dst_wave_config.data,
                                     0,
                                     dst_addr_base + dst_thread_addr_offset + dst_const_addr_offset,
                                     false,
                                     false);
#endif
}

template <>
__device__ void amd_buffer_store<half_t, 1>(const half_t* p_src_thread,
                                            half_t* p_dst_wave,
                                            index_t dst_thread_data_offset,
                                            index_t dst_const_data_offset,
                                            bool dst_data_valid,
                                            index_t dst_data_range)
{
    BufferResourceConstant<half_t> dst_wave_config;

    // wavewise base address (64 bit)
    dst_wave_config.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_config.range[2] = dst_data_range * sizeof(half_t);
    // wavewise setting (32 bit)
    dst_wave_config.range[3] = 0x00027000;

#if !CK_WORKAROUND_SWDEV_231101
    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t);
    index_t dst_const_addr_offset  = dst_const_data_offset * sizeof(half_t);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    __llvm_amdgcn_buffer_store_f16(*p_src_thread,
                                   dst_wave_config.data,
                                   0,
                                   dst_data_valid ? (dst_thread_addr_offset + dst_const_addr_offset)
                                                  : 0xffffffff,
                                   false,
                                   false);
#else
    index_t dst_addr_base = dst_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_store_f16(*p_src_thread,
                                   dst_wave_config.data,
                                   0,
                                   dst_addr_base + dst_thread_addr_offset + dst_const_addr_offset,
                                   false,
                                   false);
#endif
#else
    if(dst_data_valid)
    {
        p_dst_wave[dst_thread_data_offset + dst_const_data_offset] = *p_src_thread;
    }
#endif
}

template <>
__device__ void amd_buffer_store<half_t, 2>(const half_t* p_src_thread,
                                            half_t* p_dst_wave,
                                            index_t dst_thread_data_offset,
                                            index_t dst_const_data_offset,
                                            bool dst_data_valid,
                                            index_t dst_data_range)
{
    BufferResourceConstant<half_t> dst_wave_config;

    // wavewise base address (64 bit)
    dst_wave_config.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_config.range[2] = dst_data_range * sizeof(half_t);
    // wavewise setting (32 bit)
    dst_wave_config.range[3] = 0x00027000;

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t);
    index_t dst_const_addr_offset  = dst_const_data_offset * sizeof(half_t);

    const float* p_src_tmp = reinterpret_cast<const float*>(p_src_thread);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    __llvm_amdgcn_buffer_store_f32(*p_src_tmp,
                                   dst_wave_config.data,
                                   0,
                                   dst_data_valid ? (dst_thread_addr_offset + dst_const_addr_offset)
                                                  : 0xffffffff,
                                   false,
                                   false);
#else
    index_t dst_addr_base = dst_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_store_f32(*p_src_tmp,
                                   dst_wave_config.data,
                                   0,
                                   dst_addr_base + dst_thread_addr_offset + dst_const_addr_offset,
                                   false,
                                   false);
#endif
}

template <>
__device__ void amd_buffer_store<half_t, 4>(const half_t* p_src_thread,
                                            half_t* p_dst_wave,
                                            index_t dst_thread_data_offset,
                                            index_t dst_const_data_offset,
                                            bool dst_data_valid,
                                            index_t dst_data_range)
{
    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t);
    index_t dst_const_addr_offset  = dst_const_data_offset * sizeof(half_t);

    BufferResourceConstant<half_t> dst_wave_config;

    // wavewise base address (64 bit)
    dst_wave_config.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_config.range[2] = dst_data_range * sizeof(half_t);
    // wavewise setting (32 bit)
    dst_wave_config.range[3] = 0x00027000;

    const float2_t* p_src_tmp = reinterpret_cast<const float2_t*>(p_src_thread);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    __llvm_amdgcn_buffer_store_f32x2(
        *p_src_tmp,
        dst_wave_config.data,
        0,
        dst_data_valid ? (dst_thread_addr_offset + dst_const_addr_offset) : 0xffffffff,
        false,
        false);
#else
    index_t dst_addr_base = dst_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_store_f32x2(*p_src_tmp,
                                     dst_wave_config.data,
                                     0,
                                     dst_addr_base + dst_thread_addr_offset + dst_const_addr_offset,
                                     false,
                                     false);
#endif
}

template <>
__device__ void amd_buffer_store<half_t, 8>(const half_t* p_src_thread,
                                            half_t* p_dst_wave,
                                            index_t dst_thread_data_offset,
                                            index_t dst_const_data_offset,
                                            bool dst_data_valid,
                                            index_t dst_data_range)
{
    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t);
    index_t dst_const_addr_offset  = dst_const_data_offset * sizeof(half_t);

    BufferResourceConstant<half_t> dst_wave_config;

    // wavewise base address (64 bit)
    dst_wave_config.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_config.range[2] = dst_data_range * sizeof(half_t);
    // wavewise setting (32 bit)
    dst_wave_config.range[3] = 0x00027000;

    const float4_t* p_src_tmp = reinterpret_cast<const float4_t*>(p_src_thread);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    __llvm_amdgcn_buffer_store_f32x4(
        *p_src_tmp,
        dst_wave_config.data,
        0,
        dst_data_valid ? (dst_thread_addr_offset + dst_const_addr_offset) : 0xffffffff,
        false,
        false);
#else
    index_t dst_addr_base = dst_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_store_f32x4(*p_src_tmp,
                                     dst_wave_config.data,
                                     0,
                                     dst_addr_base + dst_thread_addr_offset + dst_const_addr_offset,
                                     false,
                                     false);
#endif
}

template <>
__device__ void amd_buffer_store<ushort, 1>(const ushort* p_src_thread,
                                            ushort* p_dst_wave,
                                            index_t dst_thread_data_offset,
                                            index_t dst_const_data_offset,
                                            bool dst_data_valid,
                                            index_t dst_data_range)
{
    BufferResourceConstant<ushort> dst_wave_config;

    // wavewise base address (64 bit)
    dst_wave_config.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_config.range[2] = dst_data_range * sizeof(ushort);
    // wavewise setting (32 bit)
    dst_wave_config.range[3] = 0x00027000;

#if !CK_WORKAROUND_SWDEV_231101
    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(ushort);
    index_t dst_const_addr_offset  = dst_const_data_offset * sizeof(ushort);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    __llvm_amdgcn_buffer_store_bf16(
        *p_src_thread,
        dst_wave_config.data,
        0,
        dst_data_valid ? (dst_thread_addr_offset + dst_const_addr_offset) : 0xffffffff,
        false,
        false);
#else
    index_t dst_addr_base = dst_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_store_bf16(*p_src_thread,
                                    dst_wave_config.data,
                                    0,
                                    dst_addr_base + dst_thread_addr_offset + dst_const_addr_offset,
                                    false,
                                    false);
#endif
#else
    if(dst_data_valid)
    {
        p_dst_wave[dst_thread_data_offset + dst_const_data_offset] = *p_src_thread;
    }
#endif
}

template <>
__device__ void amd_buffer_store<ushort, 2>(const ushort* p_src_thread,
                                            ushort* p_dst_wave,
                                            index_t dst_thread_data_offset,
                                            index_t dst_const_data_offset,
                                            bool dst_data_valid,
                                            index_t dst_data_range)
{
    BufferResourceConstant<ushort> dst_wave_config;

    // wavewise base address (64 bit)
    dst_wave_config.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_config.range[2] = dst_data_range * sizeof(ushort);
    // wavewise setting (32 bit)
    dst_wave_config.range[3] = 0x00027000;

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(ushort);
    index_t dst_const_addr_offset  = dst_const_data_offset * sizeof(ushort);

    const float* p_src_tmp = reinterpret_cast<const float*>(p_src_thread);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    __llvm_amdgcn_buffer_store_f32(*p_src_tmp,
                                   dst_wave_config.data,
                                   0,
                                   dst_data_valid ? (dst_thread_addr_offset + dst_const_addr_offset)
                                                  : 0xffffffff,
                                   false,
                                   false);
#else
    index_t dst_addr_base = dst_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_store_f32(*p_src_tmp,
                                   dst_wave_config.data,
                                   0,
                                   dst_addr_base + dst_thread_addr_offset + dst_const_addr_offset,
                                   false,
                                   false);
#endif
}

template <>
__device__ void amd_buffer_store<ushort, 4>(const ushort* p_src_thread,
                                            ushort* p_dst_wave,
                                            index_t dst_thread_data_offset,
                                            index_t dst_const_data_offset,
                                            bool dst_data_valid,
                                            index_t dst_data_range)
{
    BufferResourceConstant<ushort> dst_wave_config;

    // wavewise base address (64 bit)
    dst_wave_config.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_config.range[2] = dst_data_range * sizeof(ushort);
    // wavewise setting (32 bit)
    dst_wave_config.range[3] = 0x00027000;

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(ushort);
    index_t dst_const_addr_offset  = dst_const_data_offset * sizeof(ushort);

    const float2_t* p_src_tmp = reinterpret_cast<const float2_t*>(p_src_thread);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    __llvm_amdgcn_buffer_store_f32x2(
        *p_src_tmp,
        dst_wave_config.data,
        0,
        dst_data_valid ? (dst_thread_addr_offset + dst_const_addr_offset) : 0xffffffff,
        false,
        false);
#else
    index_t dst_addr_base = dst_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_store_f32x2(*p_src_tmp,
                                     dst_wave_config.data,
                                     0,
                                     dst_addr_base + dst_thread_addr_offset + dst_const_addr_offset,
                                     false,
                                     false);
#endif
}

template <>
__device__ void amd_buffer_store<ushort, 8>(const ushort* p_src_thread,
                                            ushort* p_dst_wave,
                                            index_t dst_thread_data_offset,
                                            index_t dst_const_data_offset,
                                            bool dst_data_valid,
                                            index_t dst_data_range)
{
    BufferResourceConstant<ushort> dst_wave_config;

    // wavewise base address (64 bit)
    dst_wave_config.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_config.range[2] = dst_data_range * sizeof(ushort);
    // wavewise setting (32 bit)
    dst_wave_config.range[3] = 0x00027000;

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(ushort);
    index_t dst_const_addr_offset  = dst_const_data_offset * sizeof(ushort);

    const float4_t* p_src_tmp = reinterpret_cast<const float4_t*>(p_src_thread);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    __llvm_amdgcn_buffer_store_f32x4(
        *p_src_tmp,
        dst_wave_config.data,
        0,
        dst_data_valid ? (dst_thread_addr_offset + dst_const_addr_offset) : 0xffffffff,
        false,
        false);
#else
    index_t dst_addr_base = dst_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_store_f32x4(*p_src_tmp,
                                     dst_wave_config.data,
                                     0,
                                     dst_addr_base + dst_thread_addr_offset + dst_const_addr_offset,
                                     false,
                                     false);
#endif
}

template <>
__device__ void amd_buffer_atomic_add<float, 1>(const float* p_src_thread,
                                                float* p_dst_wave,
                                                index_t dst_thread_data_offset,
                                                index_t dst_const_data_offset,
                                                bool dst_data_valid,
                                                index_t dst_data_range)
{
    BufferResourceConstant<float> dst_wave_config;

    // wavewise base address (64 bit)
    dst_wave_config.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_config.range[2] = dst_data_range * sizeof(float);
    // wavewise setting (32 bit)
    dst_wave_config.range[3] = 0x00027000;

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
    index_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    __llvm_amdgcn_buffer_atomic_add_f32(
        *p_src_thread,
        dst_wave_config.data,
        0,
        dst_data_valid ? (dst_thread_addr_offset + dst_const_addr_offset) : 0xffffffff,
        false);
#else
    index_t dst_addr_base = dst_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_atomic_add_f32(*p_src_thread,
                                        dst_wave_config.data,
                                        0,
                                        dst_addr_base + dst_thread_addr_offset +
                                            dst_const_addr_offset,
                                        false);
#endif
}

template <>
__device__ void amd_buffer_atomic_add<float, 2>(const float* p_src_thread,
                                                float* p_dst_wave,
                                                index_t dst_thread_data_offset,
                                                index_t dst_const_data_offset,
                                                bool dst_data_valid,
                                                index_t dst_data_range)
{
    BufferResourceConstant<float> dst_wave_config;

    // wavewise base address (64 bit)
    dst_wave_config.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_config.range[2] = dst_data_range * sizeof(float);
    // wavewise setting (32 bit)
    dst_wave_config.range[3] = 0x00027000;

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
    index_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    for(index_t i = 0; i < 2; ++i)
    {
        __llvm_amdgcn_buffer_atomic_add_f32(
            p_src_thread[i],
            dst_wave_config.data,
            0,
            dst_data_valid ? (dst_thread_addr_offset + dst_const_addr_offset + i * sizeof(float))
                           : 0xffffffff,
            false);
    }
#else
    index_t dst_addr_base = dst_data_valid ? 0 : 0x7fffffff;

    for(index_t i = 0; i < 2; ++i)
    {
        __llvm_amdgcn_buffer_atomic_add_f32(p_src_thread[i],
                                            dst_wave_config.data,
                                            0,
                                            dst_addr_base + dst_thread_addr_offset +
                                                dst_const_addr_offset + i * sizeof(float),
                                            false);
    }
#endif
}

template <>
__device__ void amd_buffer_atomic_add<float, 4>(const float* p_src_thread,
                                                float* p_dst_wave,
                                                index_t dst_thread_data_offset,
                                                index_t dst_const_data_offset,
                                                bool dst_data_valid,
                                                index_t dst_data_range)
{
    BufferResourceConstant<float> dst_wave_config;

    // wavewise base address (64 bit)
    dst_wave_config.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_config.range[2] = dst_data_range * sizeof(float);
    // wavewise setting (32 bit)
    dst_wave_config.range[3] = 0x00027000;

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
    index_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    for(index_t i = 0; i < 4; ++i)
    {
        __llvm_amdgcn_buffer_atomic_add_f32(
            p_src_thread[i],
            dst_wave_config.data,
            0,
            dst_data_valid ? (dst_thread_addr_offset + dst_const_addr_offset + i * sizeof(float))
                           : 0xffffffff,
            false);
    }
#else
    index_t dst_addr_base = dst_data_valid ? 0 : 0x7fffffff;

    for(index_t i = 0; i < 4; ++i)
    {
        __llvm_amdgcn_buffer_atomic_add_f32(p_src_thread[i],
                                            dst_wave_config.data,
                                            0,
                                            dst_addr_base + dst_thread_addr_offset +
                                                dst_const_addr_offset + i * sizeof(float),
                                            false);
    }
#endif
}
} // namespace ck } // namespace ck
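For reference, every specialization above fills the 128-bit resource constant the same way; the sketch below factors that pattern into a hypothetical builder (make_wave_buffer_resource is not in this commit), assuming the byte-sized range convention used here:

// Hypothetical helper mirroring the fill pattern used in this file.
template <typename T>
__device__ int32x4_t make_wave_buffer_resource(T* p_wave, index_t data_range)
{
    BufferResourceConstant<T> cfg;
    cfg.address[0] = p_wave;                 // dwords 0-1: 64-bit wavewise base address
    cfg.range[2]   = data_range * sizeof(T); // dword 2: range in bytes
    cfg.range[3]   = 0x00027000;             // dword 3: default config bits
    return cfg.data;
}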
......
...@@ -49,12 +49,13 @@ ...@@ -49,12 +49,13 @@
#endif #endif
// experimental implementation // experimental implementation
#ifndef CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
#define CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK 1
#endif
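Wrapping the default in #ifndef lets a build disable the trick without editing the header; a hypothetical compiler invocation:

hipcc -DCK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK=0 -c kernel.cpp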
#ifndef CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE
#define CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE 1 #define CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE 1
#define CK_EXPERIMENTAL_TENSOR_COORDINATE_USE_CALCULATE_OFFSET_DIFF 0 #endif
#define CK_EXPERIMENTAL_THREADWISE_COPY_V4R2_USE_OPTIMIZED_ADDRESS_CACLULATION 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
#ifndef CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK #ifndef CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK
#define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK 0 #define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK 0
......
...@@ -50,9 +50,11 @@ struct SetData ...@@ -50,9 +50,11 @@ struct SetData
__device__ void Run(const T* p_src, __device__ void Run(const T* p_src,
index_t src_offset, index_t src_offset,
bool src_valid, bool src_valid,
index_t /* src_range */,
T* p_dst, T* p_dst,
index_t dst_offset, index_t dst_offset,
bool dst_valid) const bool dst_valid,
index_t /* dst_range */) const
{ {
if(dst_valid) if(dst_valid)
{ {
...@@ -77,14 +79,16 @@ struct SetData ...@@ -77,14 +79,16 @@ struct SetData
__device__ void Run<AddressSpace::Global, AddressSpace::Vgpr>(const T* p_src, __device__ void Run<AddressSpace::Global, AddressSpace::Vgpr>(const T* p_src,
index_t src_offset, index_t src_offset,
bool src_valid, bool src_valid,
index_t src_range,
T* p_dst, T* p_dst,
index_t dst_offset, index_t dst_offset,
bool dst_valid) const bool dst_valid,
index_t /* dst_range */) const
{ {
if(dst_valid) if(dst_valid)
{ {
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) = *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
amd_buffer_load<T, DataPerAccess>(p_src, src_offset, 0, src_valid); amd_buffer_load<T, DataPerAccess>(p_src, src_offset, 0, src_valid, src_range);
} }
} }
...@@ -96,14 +100,16 @@ struct SetData ...@@ -96,14 +100,16 @@ struct SetData
__device__ void Run<AddressSpace::Vgpr, AddressSpace::Global>(const T* p_src, __device__ void Run<AddressSpace::Vgpr, AddressSpace::Global>(const T* p_src,
index_t src_offset, index_t src_offset,
bool src_valid, bool src_valid,
index_t /* src_range */,
T* p_dst, T* p_dst,
index_t dst_offset, index_t dst_offset,
bool dst_valid) const bool dst_valid,
index_t dst_range) const
{ {
const auto zeros = vector_t(0); const auto zeros = vector_t(0);
amd_buffer_store<T, DataPerAccess>( amd_buffer_store<T, DataPerAccess>(
src_valid ? &(p_src[src_offset]) : &zeros, p_dst, dst_offset, 0, dst_valid); src_valid ? &(p_src[src_offset]) : &zeros, p_dst, dst_offset, 0, dst_valid, dst_range);
} }
#endif #endif
}; };
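Every SetData::Run overload now takes the two range arguments even where a /* src_range */ or /* dst_range */ comment marks them unused, so all call sites stay uniform. A hypothetical call site (all value names invented):

// Hypothetical: load 4 floats from global memory into registers.
SetData<float, 4>{}.template Run<AddressSpace::Global, AddressSpace::Vgpr>(
    p_global_src, src_off, src_ok, src_elem_range, // range feeds amd_buffer_load
    p_thread_dst, dst_off, dst_ok, 0);             // dst range unused on the VGPR side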
...@@ -118,9 +124,11 @@ struct AtomicAddData ...@@ -118,9 +124,11 @@ struct AtomicAddData
__device__ void Run(const T* p_src, __device__ void Run(const T* p_src,
index_t src_offset, index_t src_offset,
bool src_valid, bool src_valid,
index_t /* src_range */,
T* p_dst, T* p_dst,
index_t dst_offset, index_t dst_offset,
bool dst_valid) const bool dst_valid,
index_t /* dst_range */) const
{ {
if(src_valid && dst_valid) if(src_valid && dst_valid)
{ {
...@@ -137,14 +145,20 @@ struct AtomicAddData ...@@ -137,14 +145,20 @@ struct AtomicAddData
template <> template <>
__device__ void Run<AddressSpace::Vgpr, AddressSpace::Global>(const T* p_src, __device__ void Run<AddressSpace::Vgpr, AddressSpace::Global>(const T* p_src,
index_t src_offset, index_t src_offset,
bool src_valid, bool src_valid,
index_t /* src_range */,
T* p_dst, T* p_dst,
index_t dst_offset, index_t dst_offset,
bool dst_valid) const bool dst_valid,
index_t dst_range) const
{ {
const auto zeros = vector_t(0); const auto zeros = vector_t(0);
amd_buffer_atomic_add<T, DataPerAccess>( amd_buffer_atomic_add<T, DataPerAccess>(src_valid ? &(p_src[src_offset]) : &zeros,
src_valid ? &(p_src[src_offset]) : &zeros, p_dst, dst_offset, 0, dst_valid); p_dst,
dst_offset,
0,
dst_valid,
dst_range);
} }
#endif #endif
}; };
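The generic AtomicAddData::Run body is elided from this hunk; it presumably guards on both valid flags (as shown above) and then performs a plain per-element atomic add. A sketch under that assumption only:

// Assumed shape of the generic (non-buffer-intrinsic) path; the real body
// is not visible in this diff.
if(src_valid && dst_valid)
{
    for(index_t i = 0; i < DataPerAccess; ++i)
    {
        atomicAdd(&p_dst[dst_offset + i], p_src[src_offset + i]);
    }
}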
...@@ -159,9 +173,11 @@ template <typename T, ...@@ -159,9 +173,11 @@ template <typename T,
__device__ void transfer_data(const T* p_src, __device__ void transfer_data(const T* p_src,
index_t src_offset, index_t src_offset,
bool src_valid, bool src_valid,
index_t src_range,
T* p_dst, T* p_dst,
index_t dst_offset, index_t dst_offset,
bool dst_valid) bool dst_valid,
index_t dst_range)
{ {
static_assert(DstInMemOp == InMemoryDataOperation::Set || static_assert(DstInMemOp == InMemoryDataOperation::Set ||
DstInMemOp == InMemoryDataOperation::AtomicAdd, DstInMemOp == InMemoryDataOperation::AtomicAdd,
...@@ -173,12 +189,12 @@ __device__ void transfer_data(const T* p_src, ...@@ -173,12 +189,12 @@ __device__ void transfer_data(const T* p_src,
// TODO: use static_if::ElseIf // TODO: use static_if::ElseIf
static_if<DstInMemOp == InMemoryDataOperation::Set>{}([&](auto) { static_if<DstInMemOp == InMemoryDataOperation::Set>{}([&](auto) {
SetData<T, DataPerAccess>{}.template Run<SrcAddressSpace, DstAddressSpace>( SetData<T, DataPerAccess>{}.template Run<SrcAddressSpace, DstAddressSpace>(
p_src, src_offset, src_valid, p_dst, dst_offset, dst_valid); p_src, src_offset, src_valid, src_range, p_dst, dst_offset, dst_valid, dst_range);
}); });
static_if<DstInMemOp == InMemoryDataOperation::AtomicAdd>{}([&](auto) { static_if<DstInMemOp == InMemoryDataOperation::AtomicAdd>{}([&](auto) {
AtomicAddData<T, DataPerAccess>{}.template Run<SrcAddressSpace, DstAddressSpace>( AtomicAddData<T, DataPerAccess>{}.template Run<SrcAddressSpace, DstAddressSpace>(
p_src, src_offset, src_valid, p_dst, dst_offset, dst_valid); p_src, src_offset, src_valid, src_range, p_dst, dst_offset, dst_valid, dst_range);
}); });
} }
else else
...@@ -191,9 +207,11 @@ __device__ void transfer_data(const T* p_src, ...@@ -191,9 +207,11 @@ __device__ void transfer_data(const T* p_src,
p_src, p_src,
src_offset + i * SrcDataStride, src_offset + i * SrcDataStride,
src_valid, src_valid,
src_range,
p_dst, p_dst,
dst_offset + i * DstDataStride, dst_offset + i * DstDataStride,
dst_valid); dst_valid,
dst_range);
}); });
static_if<DstInMemOp == InMemoryDataOperation::AtomicAdd>{}([&](auto) { static_if<DstInMemOp == InMemoryDataOperation::AtomicAdd>{}([&](auto) {
...@@ -201,9 +219,11 @@ __device__ void transfer_data(const T* p_src, ...@@ -201,9 +219,11 @@ __device__ void transfer_data(const T* p_src,
p_src, p_src,
src_offset + i * SrcDataStride, src_offset + i * SrcDataStride,
src_valid, src_valid,
src_range,
p_dst, p_dst,
dst_offset + i * DstDataStride, dst_offset + i * DstDataStride,
dst_valid); dst_valid,
dst_range);
}); });
} }
} }
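With this change, callers of transfer_data supply both ranges alongside the valid flags. A hypothetical call, assuming the template parameter order suggested by the body above (element type, access width, address spaces, memory operation, strides); all value names are invented:

// Hypothetical: Set-copy 4 floats from global memory into registers.
transfer_data<float,
              4,
              AddressSpace::Global,
              AddressSpace::Vgpr,
              InMemoryDataOperation::Set,
              1,
              1>(p_src, src_off, src_valid, src_len, p_dst, dst_off, dst_valid, dst_len);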
......