Unverified Commit ac62d13e authored by Chao Liu, committed by GitHub

Improve buffer addressing for out-of-bounds check (#21)

* Use the buffer-load built-in out-of-bounds (OOB) check; buffer size is limited to 2 GB (sketched below).
* Buffer APIs use a combined wave and thread offset.
* Use uint32_t for the address shift in buffer addressing.
parent 5c7cec11
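A minimal sketch (not part of the commit; the helper name is illustrative) of the addressing idea behind these changes: the buffer resource constant carries a wavewise base address and a byte range of at most 2 GB, and an invalid thread offset is shifted past that range so the hardware's own bounds check drops the store or zero-fills the load.

// Sketch only: hypothetical helper illustrating the out-of-bounds offset shift.
// Valid lanes keep their byte offset; invalid lanes are pushed beyond any
// possible buffer range (range <= 2 GB < 0x7fffffff bytes), so the buffer
// instruction's hardware bounds check discards the access.
__device__ inline uint32_t oob_shifted_offset(uint32_t thread_byte_offset, bool is_valid)
{
    return (is_valid ? 0u : 0x7fffffffu) + thread_byte_offset;
}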
......@@ -110,14 +110,14 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
constexpr auto in_gemmm_gemmn_global_desc = transform_tensor_descriptor(
constexpr auto in_gemmk_gemmn_global_desc = transform_tensor_descriptor(
in_n_c_y_ho_x_wo_global_desc,
make_tuple(Merge<Sequence<C, Y, X>>{}, Merge<Sequence<N, Ho, Wo>>{}),
make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
// output tensor
constexpr auto out_gemmk_gemmn_global_desc =
constexpr auto out_gemmm_gemmn_global_desc =
transform_tensor_descriptor(unfold_tensor_descriptor(out_n_k_ho_wo_global_desc, I2, I3),
make_tuple(PassThrough<K>{}, Merge<Sequence<N, Ho * Wo>>{}),
make_tuple(Sequence<1>{}, Sequence<0, 2>{}),
......@@ -130,8 +130,8 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
Float,
AccFloat,
decltype(wei_gemmk_gemmm_global_desc),
decltype(in_gemmm_gemmn_global_desc),
decltype(out_gemmk_gemmn_global_desc),
decltype(in_gemmk_gemmn_global_desc),
decltype(out_gemmm_gemmn_global_desc),
InMemoryDataOperation::Set,
GemmMPerBlock,
GemmNPerBlock,
......
......@@ -84,46 +84,23 @@ struct BlockwiseGenericTensorSliceCopy_v4
__device__ void RunLoadThreadBuffer(const BlockSrcData* p_block_src,
ThreadBufferData* p_thread_buffer) const
{
constexpr bool has_optimized_address_calculation =
decltype(mThreadwiseStore)::HasWorkingOptimizedAddressCalculation();
if(BlockSize == mThreadClusterDesc.GetElementSize() or
get_thread_local_1d_id() < mThreadClusterDesc.GetElementSize())
{
// TODO: threadwise copy is still being tweaked
if(has_optimized_address_calculation)
{
mThreadwiseLoad.Run_optimized_src_address_calculation(p_block_src, p_thread_buffer);
}
else
{
mThreadwiseLoad.Run(p_block_src, p_thread_buffer);
}
}
}
template <typename ThreadBufferData, typename BlockDstData>
__device__ void RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer,
BlockDstData* p_block_dst) const
{
constexpr bool has_optimized_address_calculation =
decltype(mThreadwiseStore)::HasWorkingOptimizedAddressCalculation();
if(BlockSize == mThreadClusterDesc.GetElementSize() or
get_thread_local_1d_id() < mThreadClusterDesc.GetElementSize())
{
// TODO: threadwise copy is still being tweaked
if(has_optimized_address_calculation)
{
mThreadwiseStore.Run_optimized_dst_address_calculation(p_thread_buffer,
p_block_dst);
}
else
{
mThreadwiseStore.Run(p_thread_buffer, p_block_dst);
}
}
}
template <typename BlockSrcData, typename BlockDstData>
__device__ void Run(const BlockSrcData* p_block_src, BlockDstData* p_block_dst) const
......
......@@ -93,11 +93,13 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
// buffer to hold a src long-vector
SrcData p_src_long_vector[long_vector_size];
#if 1
// zero out buffer
for(index_t i = 0; i < long_vector_size; ++i)
{
p_src_long_vector[i] = 0;
}
#endif
// load data from src to the long-vector buffer
for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)
......@@ -112,17 +114,20 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
// Check whether the src data has a valid mapping; only the first element of this src
// vector is checked. It is the user's responsibility to ensure that all elements in
// the src vector share the same valid/invalid mapping.
if(src_coord.IsOffsetValidAssumingUpperIndexIsValid())
{
transfer_data<SrcData,
SrcDataPerRead,
SrcAddressSpace,
AddressSpace::Vgpr,
InMemoryDataOperation::Set,
SrcDataStride,
1>(
p_src, src_coord.GetOffset(), p_src_long_vector, buffer_offset);
}
1>(p_src,
src_coord.GetOffset(),
src_coord.IsOffsetValidAssumingUpperIndexIsValid(),
SrcDesc::GetElementSpace(),
p_src_long_vector,
buffer_offset,
true,
long_vector_size);
}
// SrcData to DstData conversion
......@@ -146,334 +151,22 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
// Check whether the dst data has a valid mapping; only the first element of this dst
// vector is checked. It is the user's responsibility to ensure that all elements in
// the dst vector share the same valid/invalid mapping.
if(dst_coord.IsOffsetValidAssumingUpperIndexIsValid())
{
transfer_data<DstData,
DstDataPerWrite,
AddressSpace::Vgpr,
DstAddressSpace,
DstInMemOp,
1,
DstDataStride>(
p_dst_long_vector, buffer_offset, p_dst, dst_coord.GetOffset());
}
}
});
}
// Modify a Length to 1 if its Mask entry is false
// Used for isolating linear dimensions from non-linear dimensions
template <index_t... Lengths, index_t... Mask>
__device__ static constexpr auto mask_lengths(Sequence<Lengths...>, Sequence<Mask...>)
{
return Sequence<(Mask ? Lengths : 1)...>{};
}
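A minimal illustration (the values are made up) of what mask_lengths produces, assuming Sequence behaves as in the surrounding code:

// Sketch only: masking lengths <2, 4, 8> with linear-dimension mask <1, 0, 1>
// keeps the masked-in lengths and collapses the masked-out one to 1.
constexpr auto masked = mask_lengths(Sequence<2, 4, 8>{}, Sequence<1, 0, 1>{});
// decltype(masked) is Sequence<2, 1, 8>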
// Performs a valid-mapping check on src data: reads 0 if the src data has an invalid mapping
// Performs a valid-mapping check on dst data: no write if the dst data has an invalid mapping
// This version is optimized for address calculation of src tensor
// TODO: this function is not compiled to expected ISA
template <typename SrcData, typename DstData>
__device__ void Run_optimized_src_address_calculation(const SrcData* p_src,
DstData* p_dst) const
{
constexpr auto vector_access_dim = Number<SrcDstVectorReadWriteDim>{};
constexpr auto src_data_per_access = Number<SrcDataPerRead>{};
constexpr auto dst_data_per_access = Number<DstDataPerWrite>{};
constexpr auto long_vector_size = Number<math::lcm(SrcDataPerRead, DstDataPerWrite)>{};
constexpr auto long_vector_access_lengths = SliceLengths::Modify(
vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);
// separate linear dimensions from non-linear dimensions
constexpr auto src_linear_dim_mask = SrcDesc::GetLinearDimensionMask();
constexpr auto src_nonlinear_dim_mask = SrcDesc::GetNonLinearDimensionMask();
static_assert(
src_linear_dim_mask.At(SrcDstVectorReadWriteDim) || long_vector_size == SrcDataPerRead,
"Warning! SrcDstVectorReadWriteDim is not SrcDesc's linear dimension, performance "
"would drop");
// separate steps into linear and non-linear components, according to the src tensor
constexpr auto linear_long_vector_access_lengths =
mask_lengths(long_vector_access_lengths, src_linear_dim_mask);
constexpr auto nonlinear_long_vector_access_lengths =
mask_lengths(long_vector_access_lengths, src_nonlinear_dim_mask);
// loop over src's non-linear dimensions
ford<decltype(nonlinear_long_vector_access_lengths)>{}([&](
auto nonlinear_dim_long_vector_access_id) {
// calculate step-sizes along src's nonlinear dimensions
auto nonlinear_dim_data_steps = nonlinear_dim_long_vector_access_id;
nonlinear_dim_data_steps(vector_access_dim) =
long_vector_size * nonlinear_dim_long_vector_access_id[vector_access_dim];
// move src coordinate along nonlinear dimensions
// this coordinate contains run-time per-thread offset
const auto src_nonlinear_coord = mSrcSliceOrigin + nonlinear_dim_data_steps;
// loop over src's linear dimensions
ford<decltype(linear_long_vector_access_lengths)>{}([&](
auto linear_dim_long_vector_access_id) {
// step-sizes along src's linear dimensions
auto linear_dim_data_steps = linear_dim_long_vector_access_id;
linear_dim_data_steps(vector_access_dim) =
long_vector_size * linear_dim_long_vector_access_id[vector_access_dim];
// buffer to hold a long-vector
SrcData p_src_long_vector[long_vector_size];
// zero out buffer
for(index_t i = 0; i < long_vector_size; ++i)
{
p_src_long_vector[i] = 0;
}
// Loop over SrcDstVectorReadWriteDim, and load data from src to the
// long-vector buffer.
// If SrcDstVectorReadWriteDim is src's linear dimension, then src's
// offset-diff due to this looping is known at compile-time. If
// SrcDstVectorReadWriteDim is src's nonlinear dimension, then src's
// offset-diff due to this looping is only known at run-time. For best
// performance, SrcDstVectorReadWriteDim should be src's linear dimension
for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(vector_access_dim) = i * src_data_per_access;
const index_t buffer_offset = i * src_data_per_access;
// move src coordinate along linear dimensions
const auto src_coord =
src_nonlinear_coord + (linear_dim_data_steps + scalar_id);
#if CK_EXPERIMENTAL_TENSOR_COORDINATE_USE_CALCULATE_OFFSET_DIFF // tweaking
// this is src compile-time offset
const index_t src_linear_offset =
src_nonlinear_coord.CalculateOffsetDiff(linear_dim_data_steps + scalar_id);
#else
// this is src compile-time offset
const index_t src_linear_offset =
src_coord.GetOffset() - src_nonlinear_coord.GetOffset();
#endif
// Check whether the src data has a valid mapping; only the first element of this
// src vector is checked. It is the user's responsibility to ensure that all elements
// in the src vector share the same valid/invalid mapping.
if(src_coord.IsOffsetValidAssumingUpperIndexIsValid())
{
transfer_data<SrcData,
SrcDataPerRead,
SrcAddressSpace,
AddressSpace::Vgpr,
InMemoryDataOperation::Set>(p_src,
src_nonlinear_coord.GetOffset() +
src_linear_offset,
p_src_long_vector,
buffer_offset);
}
}
// SrcData to DstData conversion
DstData p_dst_long_vector[long_vector_size];
for(index_t i = 0; i < long_vector_size; ++i)
{
p_dst_long_vector[i] = type_convert<DstData>{}(p_src_long_vector[i]);
}
// store data from the long-vector buffer to dst
for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(vector_access_dim) = i * dst_data_per_access;
const index_t buffer_offset = i * dst_data_per_access;
// dst offset is calculated here, without explicitly separating into
// compile-time and per-thread component
const auto dst_coord = mDstSliceOrigin + (nonlinear_dim_data_steps +
linear_dim_data_steps + scalar_id);
// Check whether the dst data has a valid mapping; only the first element of this
// dst vector is checked. It is the user's responsibility to ensure that all elements
// in the dst vector share the same valid/invalid mapping.
if(dst_coord.IsOffsetValidAssumingUpperIndexIsValid())
{
transfer_data<DstData,
DstDataPerWrite,
AddressSpace::Vgpr,
DstAddressSpace,
DstInMemOp>(
p_dst_long_vector, buffer_offset, p_dst, dst_coord.GetOffset());
}
}
});
});
}
// This version is optimized for address calculation of dst tensor
// TODO: this function is not compiled to expected ISA
template <typename SrcData, typename DstData>
__device__ void Run_optimized_dst_address_calculation(const SrcData* p_src,
DstData* p_dst) const
{
constexpr auto vector_access_dim = Number<SrcDstVectorReadWriteDim>{};
constexpr auto src_data_per_access = Number<SrcDataPerRead>{};
constexpr auto dst_data_per_access = Number<DstDataPerWrite>{};
constexpr auto long_vector_size = Number<math::lcm(SrcDataPerRead, DstDataPerWrite)>{};
constexpr auto long_vector_access_lengths = SliceLengths::Modify(
vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);
// separate linear dimensions from non-linear dimensions
constexpr auto dst_linear_dim_mask = DstDesc::GetLinearDimensionMask();
constexpr auto dst_nonlinear_dim_mask = DstDesc::GetNonLinearDimensionMask();
static_assert(
dst_linear_dim_mask.At(SrcDstVectorReadWriteDim) || long_vector_size == DstDataPerWrite,
"Warning! SrcDstVectorReadWriteDim is not DstDesc's linear dimension, performance "
"would drop");
// separate steps into linear and non-linear components, according to the dst tensor
constexpr auto linear_long_vector_access_lengths =
mask_lengths(long_vector_access_lengths, dst_linear_dim_mask);
constexpr auto nonlinear_long_vector_access_lengths =
mask_lengths(long_vector_access_lengths, dst_nonlinear_dim_mask);
// loop over dst's non-linear dimensions
ford<decltype(nonlinear_long_vector_access_lengths)>{}([&](
auto nonlinear_dim_long_vector_access_id) {
// calculate step-sizes along dst's nonlinear dimensions
auto nonlinear_dim_data_steps = nonlinear_dim_long_vector_access_id;
nonlinear_dim_data_steps(vector_access_dim) =
long_vector_size * nonlinear_dim_long_vector_access_id[vector_access_dim];
// move dst coordinate along nonlinear dimensions
// this coordinate contains run-time per-thread offset
const auto dst_nonlinear_coord = mDstSliceOrigin + nonlinear_dim_data_steps;
// loop over dst's linear dimensions
ford<decltype(linear_long_vector_access_lengths)>{}([&](
auto linear_dim_long_vector_access_id) {
// step-sizes along dst's linear dimensions
auto linear_dim_data_steps = linear_dim_long_vector_access_id;
linear_dim_data_steps(vector_access_dim) =
long_vector_size * linear_dim_long_vector_access_id[vector_access_dim];
// buffer to hold a long-vector
SrcData p_src_long_vector[long_vector_size];
// zero out buffer
for(index_t i = 0; i < long_vector_size; ++i)
{
p_src_long_vector[i] = 0;
}
// Loop over SrcDstVectorReadWriteDim, and load data from src to the
// long-vector buffer.
// If SrcDstVectorReadWriteDim is dst's linear dimension, then dst's
// offset-diff due to this looping is known at compile-time. If
// SrcDstVectorReadWriteDim is dst's nonlinear dimension, then dst's
// offset-diff due to this looping is only known at run-time. For best
// performance, SrcDstVectorReadWriteDim should be dst's linear dimension
for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(vector_access_dim) = i * src_data_per_access;
const index_t buffer_offset = i * src_data_per_access;
// src offset is calculated here, without explicitly separating into
// compile-time and per-thread component
const auto src_coord = mSrcSliceOrigin + (nonlinear_dim_data_steps +
linear_dim_data_steps + scalar_id);
// Check whether the src data has a valid mapping; only the first element of this
// src vector is checked. It is the user's responsibility to ensure that all elements
// in the src vector share the same valid/invalid mapping.
if(src_coord.IsOffsetValidAssumingUpperIndexIsValid())
{
transfer_data<SrcData,
SrcDataPerRead,
SrcAddressSpace,
AddressSpace::Vgpr,
InMemoryDataOperation::Set>(
p_src, src_coord.GetOffset(), p_src_long_vector, buffer_offset);
}
}
// SrcData to DstData conversion
DstData p_dst_long_vector[long_vector_size];
for(index_t i = 0; i < long_vector_size; ++i)
{
p_dst_long_vector[i] = type_convert<DstData>{}(p_src_long_vector[i]);
}
// store data from the long-vector buffer to dst
for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(vector_access_dim) = i * dst_data_per_access;
const index_t buffer_offset = i * dst_data_per_access;
// move dst coordinate along linear dimensions
const auto dst_coord =
dst_nonlinear_coord + (linear_dim_data_steps + scalar_id);
#if CK_EXPERIMENTAL_TENSOR_COORDINATE_USE_CALCULATE_OFFSET_DIFF // tweaking
// this is dst compile-time offset
const index_t dst_linear_offset =
dst_nonlinear_coord.CalculateOffsetDiff(linear_dim_data_steps + scalar_id);
#else
// this is dst compile-time offset
const index_t dst_linear_offset =
dst_coord.GetOffset() - dst_nonlinear_coord.GetOffset();
#endif
// Check whether the dst data has a valid mapping; only the first element of this
// dst vector is checked. It is the user's responsibility to ensure that all elements
// in the dst vector share the same valid/invalid mapping.
if(dst_coord.IsOffsetValidAssumingUpperIndexIsValid())
{
transfer_data<DstData,
DstDataPerWrite,
AddressSpace::Vgpr,
DstAddressSpace,
DstInMemOp>(p_dst_long_vector,
DstDataStride>(p_dst_long_vector,
buffer_offset,
true,
long_vector_size,
p_dst,
dst_nonlinear_coord.GetOffset() +
dst_linear_offset);
}
dst_coord.GetOffset(),
dst_coord.IsOffsetValidAssumingUpperIndexIsValid(),
DstDesc::GetElementSpace());
}
});
});
}
__device__ static constexpr bool HasWorkingOptimizedAddressCalculation()
{
#if CK_EXPERIMENTAL_THREADWISE_COPY_V4R2_USE_OPTIMIZED_ADDRESS_CACLULATION // tweaking
return true;
#else
return false;
#endif
}
template <typename T, bool PositiveDirection>
......
......@@ -5,118 +5,119 @@
namespace ck {
// For 128bit SGPRs in buffer_load and buffer_store instructions
// For the 128-bit SGPR resource constant supplied to buffer instructions
// https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions
template <typename T>
union BufferAddressConfig
union BufferResourceConstant
{
int32x4_t data;
T* address[2];
int32_t range[4];
int32_t config[4];
};
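A minimal sketch (not part of this header; the helper name is hypothetical) of how the specializations below fill this union before handing its data member to a buffer intrinsic:

// Sketch only: mirrors the fill pattern used by the amd_buffer_* specializations below.
template <typename T>
__device__ int32x4_t make_wave_buffer_resource(T* p_wave, index_t data_range)
{
    BufferResourceConstant<T> res;
    res.address[0] = p_wave;                 // words 0-1: wavewise base address (64 bit)
    res.range[2]   = data_range * sizeof(T); // word 2: buffer range in bytes (limited to 2 GB)
    res.config[3]  = 0x00027000;             // word 3: stride/format setting
    return res.data;
}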
__device__ float __llvm_amdgcn_buffer_load_f32(int32x4_t rsrc,
__device__ float __llvm_amdgcn_buffer_load_f32(int32x4_t srsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.load.f32");
__device__ float2_t
__llvm_amdgcn_buffer_load_f32x2(int32x4_t rsrc,
__llvm_amdgcn_buffer_load_f32x2(int32x4_t srsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.load.v2f32");
__device__ float4_t
__llvm_amdgcn_buffer_load_f32x4(int32x4_t rsrc,
__llvm_amdgcn_buffer_load_f32x4(int32x4_t srsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.load.v4f32");
__device__ half_t __llvm_amdgcn_buffer_load_f16(int32x4_t rsrc,
__device__ half_t __llvm_amdgcn_buffer_load_f16(int32x4_t srsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.load.f16");
__device__ half2_t __llvm_amdgcn_buffer_load_f16x2(int32x4_t rsrc,
__device__ half2_t __llvm_amdgcn_buffer_load_f16x2(int32x4_t srsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.load.v2f16");
__device__ half4_t __llvm_amdgcn_buffer_load_f16x4(int32x4_t rsrc,
__device__ half4_t __llvm_amdgcn_buffer_load_f16x4(int32x4_t srsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.load.v4f16");
__device__ ushort __llvm_amdgcn_buffer_load_bf16(int32x4_t rsrc,
__device__ ushort __llvm_amdgcn_buffer_load_bf16(int32x4_t srsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.load.bf16");
__device__ ushort2_t
__llvm_amdgcn_buffer_load_bf16x2(int32x4_t rsrc,
__llvm_amdgcn_buffer_load_bf16x2(int32x4_t srsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.load.v2bf16");
__device__ ushort4_t
__llvm_amdgcn_buffer_load_bf16x4(int32x4_t rsrc,
__llvm_amdgcn_buffer_load_bf16x4(int32x4_t srsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.load.v4bf16");
__device__ void __llvm_amdgcn_buffer_store_f32(float vdata,
int32x4_t rsrc,
int32x4_t srsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.store.f32");
__device__ void __llvm_amdgcn_buffer_store_f32x2(float2_t vdata,
int32x4_t rsrc,
int32x4_t srsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.store.v2f32");
__device__ void __llvm_amdgcn_buffer_store_f32x4(float4_t vdata,
int32x4_t rsrc,
int32x4_t srsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.store.v4f32");
__device__ void __llvm_amdgcn_buffer_store_f16(half_t vdata,
int32x4_t rsrc,
int32x4_t srsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.store.f16");
__device__ void __llvm_amdgcn_buffer_store_f16x2(half2_t vdata,
int32x4_t rsrc,
int32x4_t srsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.store.v2f16");
__device__ void __llvm_amdgcn_buffer_store_f16x4(half4_t vdata,
int32x4_t rsrc,
int32x4_t srsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.store.v4f16");
__device__ void __llvm_amdgcn_buffer_store_bf16(ushort vdata,
int32x4_t rsrc,
int32x4_t srsrc,
index_t vindex,
index_t offset,
bool glc,
......@@ -124,7 +125,7 @@ __device__ void __llvm_amdgcn_buffer_store_bf16(ushort vdata,
__device__ void
__llvm_amdgcn_buffer_store_bf16x2(ushort2_t vdata,
int32x4_t rsrc,
int32x4_t srsrc,
index_t vindex,
index_t offset,
bool glc,
......@@ -132,7 +133,7 @@ __llvm_amdgcn_buffer_store_bf16x2(ushort2_t vdata,
__device__ void
__llvm_amdgcn_buffer_store_bf16x4(ushort4_t vdata,
int32x4_t rsrc,
int32x4_t srsrc,
index_t vindex,
index_t offset,
bool glc,
......@@ -140,646 +141,986 @@ __llvm_amdgcn_buffer_store_bf16x4(ushort4_t vdata,
__device__ void
__llvm_amdgcn_buffer_atomic_add_f32(float vdata,
int32x4_t rsrc,
int32x4_t srsrc,
index_t vindex,
index_t offset,
bool slc) __asm("llvm.amdgcn.buffer.atomic.fadd.f32");
// buffer_load requires:
// 1) p_src must be in global memory space, d_dst must be vgpr
// 2) p_src to be a block-invariant pointer.
// 1) p_src_thread must be in global memory space, p_dst_thread must be vgpr
// 2) p_src_thread must be a wavewise pointer.
// It is the user's responsibility to make sure that is true.
template <typename T, index_t VectorSize>
__device__ typename vector_type<T, VectorSize>::MemoryType amd_buffer_load(
const T* p_src_block, index_t src_thread_data_offset, index_t src_const_data_offset);
__device__ typename vector_type<T, VectorSize>::MemoryType
amd_buffer_load(const T* p_src_wave,
index_t src_thread_data_offset,
bool src_thread_data_valid,
index_t src_element_space);
// buffer_store requires:
// 1) p_src must be in vgpr space, d_dst must be global memory
// 2) p_dst to be a block-invariant pointer.
// 1) p_src_thread must be in vgpr space, p_dst_thread must be global memory
// 2) p_dst_thread must be a wavewise pointer.
// It is the user's responsibility to make sure that is true.
template <typename T, index_t VectorSize>
__device__ void amd_buffer_store(const T* p_src,
T* p_dst_block,
__device__ void amd_buffer_store(const T* p_src_thread,
T* p_dst_wave,
index_t dst_thread_data_offset,
index_t dst_const_data_offset);
bool dst_thread_data_valid,
index_t dst_data_range);
// buffer_atomic requires:
// 1) p_src_thread must be in vgpr space, p_dst_thread must be global memory
// 2) p_dst_thread must be a wavewise pointer.
// It is the user's responsibility to make sure that is true.
template <typename T, index_t VectorSize>
__device__ void amd_buffer_atomic_add(const T* p_src,
T* p_dst_block,
__device__ void amd_buffer_atomic_add(const T* p_src_thread,
T* p_dst_wave,
index_t dst_thread_data_offset,
index_t dst_const_data_offset);
bool dst_thread_data_valid,
index_t dst_data_range);
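A minimal usage sketch of the three APIs declared above (the function and variable names here are illustrative), assuming the caller has already computed a per-thread data offset and a validity flag:

// Sketch only: typical call pattern for the wavewise buffer APIs.
__device__ void copy_one_float(const float* p_src_wave,
                               float* p_dst_wave,
                               index_t thread_data_offset,
                               bool thread_data_valid,
                               index_t data_range)
{
    // Invalid lanes read back 0 thanks to the hardware bounds check.
    float v = amd_buffer_load<float, 1>(
        p_src_wave, thread_data_offset, thread_data_valid, data_range);

    // Invalid lanes are silently dropped by the hardware bounds check.
    amd_buffer_store<float, 1>(
        &v, p_dst_wave, thread_data_offset, thread_data_valid, data_range);
}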
template <>
__device__ float amd_buffer_load<float, 1>(const float* p_src_block,
__device__ float amd_buffer_load<float, 1>(const float* p_src_wave,
index_t src_thread_data_offset,
index_t src_const_data_offset)
bool src_thread_data_valid,
index_t src_data_range)
{
BufferAddressConfig<float> src_block_config;
BufferResourceConstant<float> src_wave_buffer_resource;
// fill in byte 0 - 1
src_block_config.address[0] = const_cast<float*>(p_src_block);
// fill in byte 2
src_block_config.range[2] = -1;
// fill in byte 3
src_block_config.range[3] = 0x00027000;
// wavewise base address (64 bit)
src_wave_buffer_resource.address[0] = const_cast<float*>(p_src_wave);
// wavewise range (32 bit)
src_wave_buffer_resource.range[2] = src_data_range * sizeof(float);
// wavewise setting (32 bit)
src_wave_buffer_resource.config[3] = 0x00027000;
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
index_t src_const_addr_offset = src_const_data_offset * sizeof(float);
#if 1 // debug
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
return __llvm_amdgcn_buffer_load_f32(src_wave_buffer_resource.data,
0,
src_thread_data_valid ? src_thread_addr_offset
: 0xffffffff,
false,
false);
#else
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
return __llvm_amdgcn_buffer_load_f32(
src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#endif
#else
return src_thread_data_valid
? __llvm_amdgcn_buffer_load_f32(
src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false)
: 0;
#endif
}
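For reference, a plain sketch (a hypothetical helper that ignores the buffer intrinsic) of what the specialization above is semantically equivalent to:

// Sketch only: semantic reference for amd_buffer_load<float, 1>.
// The hardware bounds check makes an out-of-range lane read back 0.0f.
__device__ inline float buffer_load_f32_reference(const float* p_src_wave,
                                                  index_t thread_data_offset,
                                                  bool thread_data_valid)
{
    return thread_data_valid ? p_src_wave[thread_data_offset] : 0.0f;
}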
template <>
__device__ float2_t amd_buffer_load<float, 2>(const float* p_src_block,
__device__ float2_t amd_buffer_load<float, 2>(const float* p_src_wave,
index_t src_thread_data_offset,
index_t src_const_data_offset)
bool src_thread_data_valid,
index_t src_data_range)
{
BufferAddressConfig<float> src_block_config;
BufferResourceConstant<float> src_wave_buffer_resource;
// fill in byte 0 - 1
src_block_config.address[0] = const_cast<float*>(p_src_block);
// fill in byte 2
src_block_config.range[2] = -1;
// fill in byte 3
src_block_config.range[3] = 0x00027000;
// wavewise base address (64 bit)
src_wave_buffer_resource.address[0] = const_cast<float*>(p_src_wave);
// wavewise range (32 bit)
src_wave_buffer_resource.range[2] = src_data_range * sizeof(float);
// wavewise setting (32 bit)
src_wave_buffer_resource.config[3] = 0x00027000;
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
index_t src_const_addr_offset = src_const_data_offset * sizeof(float);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
return __llvm_amdgcn_buffer_load_f32x2(src_wave_buffer_resource.data,
0,
src_thread_data_valid ? src_thread_addr_offset
: 0xffffffff,
false,
false);
#else
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
return __llvm_amdgcn_buffer_load_f32x2(
src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#endif
}
template <>
__device__ float4_t amd_buffer_load<float, 4>(const float* p_src_block,
__device__ float4_t amd_buffer_load<float, 4>(const float* p_src_wave,
index_t src_thread_data_offset,
index_t src_const_data_offset)
bool src_thread_data_valid,
index_t src_data_range)
{
BufferAddressConfig<float> src_block_config;
BufferResourceConstant<float> src_wave_buffer_resource;
// fill in byte 0 - 1
src_block_config.address[0] = const_cast<float*>(p_src_block);
// fill in byte 2
src_block_config.range[2] = -1;
// fill in byte 3
src_block_config.range[3] = 0x00027000;
// wavewise base address (64 bit)
src_wave_buffer_resource.address[0] = const_cast<float*>(p_src_wave);
// wavewise range (32 bit)
src_wave_buffer_resource.range[2] = src_data_range * sizeof(float);
// wavewise setting (32 bit)
src_wave_buffer_resource.config[3] = 0x00027000;
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
index_t src_const_addr_offset = src_const_data_offset * sizeof(float);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
return __llvm_amdgcn_buffer_load_f32x4(src_wave_buffer_resource.data,
0,
src_thread_data_valid ? src_thread_addr_offset
: 0xffffffff,
false,
false);
#else
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
return __llvm_amdgcn_buffer_load_f32x4(
src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#endif
}
template <>
__device__ half_t amd_buffer_load<half_t, 1>(const half_t* p_src_block,
__device__ half_t amd_buffer_load<half_t, 1>(const half_t* p_src_wave,
index_t src_thread_data_offset,
index_t src_const_data_offset)
bool src_thread_data_valid,
index_t src_data_range)
{
BufferAddressConfig<half_t> src_block_config;
BufferResourceConstant<half_t> src_wave_buffer_resource;
// fill in byte 0 - 1
src_block_config.address[0] = const_cast<half_t*>(p_src_block);
// fill in byte 2
src_block_config.range[2] = -1;
// fill in byte 3
src_block_config.range[3] = 0x00027000;
// wavewise base address (64 bit)
src_wave_buffer_resource.address[0] = const_cast<half_t*>(p_src_wave);
// wavewise range (32 bit)
src_wave_buffer_resource.range[2] = src_data_range * sizeof(half_t);
// wavewise setting (32 bit)
src_wave_buffer_resource.config[3] = 0x00027000;
#if !CK_WORKAROUND_SWDEV_231101
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);
index_t src_const_addr_offset = src_const_data_offset * sizeof(half_t);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
return __llvm_amdgcn_buffer_load_f16(src_wave_buffer_resource.data,
0,
src_thread_data_valid ? src_thread_addr_offset
: 0xffffffff,
false,
false);
#else
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
return __llvm_amdgcn_buffer_load_f16(
src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#endif
#else
return p_src_block[src_thread_data_offset + src_const_data_offset];
return src_thread_data_valid ? p_src_wave[src_thread_data_offset] : 0;
#endif
}
template <>
__device__ half2_t amd_buffer_load<half_t, 2>(const half_t* p_src_block,
__device__ half2_t amd_buffer_load<half_t, 2>(const half_t* p_src_wave,
index_t src_thread_data_offset,
index_t src_const_data_offset)
bool src_thread_data_valid,
index_t src_data_range)
{
BufferAddressConfig<half_t> src_block_config;
BufferResourceConstant<half_t> src_wave_buffer_resource;
// fill in byte 0 - 1
src_block_config.address[0] = const_cast<half_t*>(p_src_block);
// fill in byte 2
src_block_config.range[2] = -1;
// fill in byte 3
src_block_config.range[3] = 0x00027000;
// wavewise base address (64 bit)
src_wave_buffer_resource.address[0] = const_cast<half_t*>(p_src_wave);
// wavewise range (32 bit)
src_wave_buffer_resource.range[2] = src_data_range * sizeof(half_t);
// wavewise setting (32 bit)
src_wave_buffer_resource.config[3] = 0x00027000;
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);
index_t src_const_addr_offset = src_const_data_offset * sizeof(half_t);
#if !CK_WORKAROUND_SWDEV_231101
return __llvm_amdgcn_buffer_load_f16x2(
src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
float dst_out_tmp =
__llvm_amdgcn_buffer_load_f32(src_wave_buffer_resource.data,
0,
src_thread_data_valid ? src_thread_addr_offset : 0xffffffff,
false,
false);
#else
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
float dst_out_tmp = __llvm_amdgcn_buffer_load_f32(
src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#endif
return *reinterpret_cast<half2_t*>(&dst_out_tmp);
#endif
}
template <>
__device__ half4_t amd_buffer_load<half_t, 4>(const half_t* p_src_block,
__device__ half4_t amd_buffer_load<half_t, 4>(const half_t* p_src_wave,
index_t src_thread_data_offset,
index_t src_const_data_offset)
bool src_thread_data_valid,
index_t src_data_range)
{
BufferAddressConfig<half_t> src_block_config;
BufferResourceConstant<half_t> src_wave_buffer_resource;
// fill in byte 0 - 1
src_block_config.address[0] = const_cast<half_t*>(p_src_block);
// fill in byte 2
src_block_config.range[2] = -1;
// fill in byte 3
src_block_config.range[3] = 0x00027000;
// wavewise base address (64 bit)
src_wave_buffer_resource.address[0] = const_cast<half_t*>(p_src_wave);
// wavewise range (32 bit)
src_wave_buffer_resource.range[2] = src_data_range * sizeof(half_t);
// wavewise setting (32 bit)
src_wave_buffer_resource.config[3] = 0x00027000;
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);
index_t src_const_addr_offset = src_const_data_offset * sizeof(half_t);
#if !CK_WORKAROUND_SWDEV_231101
return __llvm_amdgcn_buffer_load_f16x4(
src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
float2_t dst_out_tmp =
__llvm_amdgcn_buffer_load_f32x2(src_wave_buffer_resource.data,
0,
src_thread_data_valid ? src_thread_addr_offset : 0xffffffff,
false,
false);
#else
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
float2_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x2(
src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#endif
return *reinterpret_cast<half4_t*>(&dst_out_tmp);
#endif
}
template <>
__device__ half8_t amd_buffer_load<half_t, 8>(const half_t* p_src_block,
__device__ half8_t amd_buffer_load<half_t, 8>(const half_t* p_src_wave,
index_t src_thread_data_offset,
index_t src_const_data_offset)
bool src_thread_data_valid,
index_t src_data_range)
{
BufferAddressConfig<half_t> src_block_config;
BufferResourceConstant<half_t> src_wave_buffer_resource;
// fill in byte 0 - 1
src_block_config.address[0] = const_cast<half_t*>(p_src_block);
// fill in byte 2
src_block_config.range[2] = -1;
// fill in byte 3
src_block_config.range[3] = 0x00027000;
// wavewise base address (64 bit)
src_wave_buffer_resource.address[0] = const_cast<half_t*>(p_src_wave);
// wavewise range (32 bit)
src_wave_buffer_resource.range[2] = src_data_range * sizeof(half_t);
// wavewise setting (32 bit)
src_wave_buffer_resource.config[3] = 0x00027000;
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);
index_t src_const_addr_offset = src_const_data_offset * sizeof(half_t);
#if !CK_WORKAROUND_SWDEV_231101
static_assert(false, "wrong! not supported");
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
float4_t dst_out_tmp =
__llvm_amdgcn_buffer_load_f32x4(src_wave_buffer_resource.data,
0,
src_thread_data_valid ? src_thread_addr_offset : 0xffffffff,
false,
false);
#else
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
float4_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x4(
src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#endif
return *reinterpret_cast<half8_t*>(&dst_out_tmp);
#endif
}
template <>
__device__ ushort amd_buffer_load<ushort, 1>(const ushort* p_src_block,
__device__ ushort amd_buffer_load<ushort, 1>(const ushort* p_src_wave,
index_t src_thread_data_offset,
index_t src_const_data_offset)
bool src_thread_data_valid,
index_t src_data_range)
{
BufferAddressConfig<ushort> src_block_config;
BufferResourceConstant<ushort> src_wave_buffer_resource;
// fill in byte 0 - 1
src_block_config.address[0] = const_cast<ushort*>(p_src_block);
// fill in byte 2
src_block_config.range[2] = -1;
// fill in byte 3
src_block_config.range[3] = 0x00027000;
// wavewise base address (64 bit)
src_wave_buffer_resource.address[0] = const_cast<ushort*>(p_src_wave);
// wavewise range (32 bit)
src_wave_buffer_resource.range[2] = src_data_range * sizeof(ushort);
// wavewise setting (32 bit)
src_wave_buffer_resource.config[3] = 0x00027000;
#if !CK_WORKAROUND_SWDEV_231101
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);
index_t src_const_addr_offset = src_const_data_offset * sizeof(ushort);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
return __llvm_amdgcn_buffer_load_bf16(src_wave_buffer_resource.data,
0,
src_thread_data_valid ? src_thread_addr_offset
: 0xffffffff,
false,
false);
#else
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
return __llvm_amdgcn_buffer_load_bf16(
src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#endif
#else
return p_src_block[src_thread_data_offset + src_const_data_offset];
return src_thread_data_valid ? p_src_wave[src_thread_data_offset] : 0;
#endif
}
template <>
__device__ ushort2_t amd_buffer_load<ushort, 2>(const ushort* p_src_block,
__device__ ushort2_t amd_buffer_load<ushort, 2>(const ushort* p_src_wave,
index_t src_thread_data_offset,
index_t src_const_data_offset)
bool src_thread_data_valid,
index_t src_data_range)
{
BufferAddressConfig<ushort> src_block_config;
BufferResourceConstant<ushort> src_wave_buffer_resource;
// fill in byte 0 - 1
src_block_config.address[0] = const_cast<ushort*>(p_src_block);
// fill in byte 2
src_block_config.range[2] = -1;
// fill in byte 3
src_block_config.range[3] = 0x00027000;
// wavewise base address (64 bit)
src_wave_buffer_resource.address[0] = const_cast<ushort*>(p_src_wave);
// wavewise range (32 bit)
src_wave_buffer_resource.range[2] = src_data_range * sizeof(ushort);
// wavewise setting (32 bit)
src_wave_buffer_resource.config[3] = 0x00027000;
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);
index_t src_const_addr_offset = src_const_data_offset * sizeof(ushort);
#if !CK_WORKAROUND_SWDEV_231101
return __llvm_amdgcn_buffer_load_bf16x2(
src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
float dst_out_tmp =
__llvm_amdgcn_buffer_load_f32(src_wave_buffer_resource.data,
0,
src_thread_data_valid ? src_thread_addr_offset : 0xffffffff,
false,
false);
#else
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
float dst_out_tmp = __llvm_amdgcn_buffer_load_f32(
src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#endif
return *reinterpret_cast<ushort2_t*>(&dst_out_tmp);
#endif
}
template <>
__device__ ushort4_t amd_buffer_load<ushort, 4>(const ushort* p_src_block,
__device__ ushort4_t amd_buffer_load<ushort, 4>(const ushort* p_src_wave,
index_t src_thread_data_offset,
index_t src_const_data_offset)
bool src_thread_data_valid,
index_t src_data_range)
{
BufferAddressConfig<ushort> src_block_config;
BufferResourceConstant<ushort> src_wave_buffer_resource;
// fill in byte 0 - 1
src_block_config.address[0] = const_cast<ushort*>(p_src_block);
// fill in byte 2
src_block_config.range[2] = -1;
// fill in byte 3
src_block_config.range[3] = 0x00027000;
// wavewise base address (64 bit)
src_wave_buffer_resource.address[0] = const_cast<ushort*>(p_src_wave);
// wavewise range (32 bit)
src_wave_buffer_resource.range[2] = src_data_range * sizeof(ushort);
// wavewise setting (32 bit)
src_wave_buffer_resource.config[3] = 0x00027000;
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);
index_t src_const_addr_offset = src_const_data_offset * sizeof(ushort);
#if !CK_WORKAROUND_SWDEV_231101
return __llvm_amdgcn_buffer_load_bf16x4(
src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
float2_t dst_out_tmp =
__llvm_amdgcn_buffer_load_f32x2(src_wave_buffer_resource.data,
0,
src_thread_data_valid ? src_thread_addr_offset : 0xffffffff,
false,
false);
#else
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
float2_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x2(
src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#endif
return *reinterpret_cast<ushort4_t*>(&dst_out_tmp);
#endif
}
template <>
__device__ ushort8_t amd_buffer_load<ushort, 8>(const ushort* p_src_block,
__device__ ushort8_t amd_buffer_load<ushort, 8>(const ushort* p_src_wave,
index_t src_thread_data_offset,
index_t src_const_data_offset)
bool src_thread_data_valid,
index_t src_data_range)
{
BufferAddressConfig<ushort> src_block_config;
BufferResourceConstant<ushort> src_wave_buffer_resource;
// fill in byte 0 - 1
src_block_config.address[0] = const_cast<ushort*>(p_src_block);
// fill in byte 2
src_block_config.range[2] = -1;
// fill in byte 3
src_block_config.range[3] = 0x00027000;
// wavewise base address (64 bit)
src_wave_buffer_resource.address[0] = const_cast<ushort*>(p_src_wave);
// wavewise range (32 bit)
src_wave_buffer_resource.range[2] = src_data_range * sizeof(ushort);
// wavewise setting (32 bit)
src_wave_buffer_resource.config[3] = 0x00027000;
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);
index_t src_const_addr_offset = src_const_data_offset * sizeof(ushort);
#if !CK_WORKAROUND_SWDEV_231101
static_assert(false, "wrong! not implemented");
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
float4_t dst_out_tmp =
__llvm_amdgcn_buffer_load_f32x4(src_wave_buffer_resource.data,
0,
src_thread_data_valid ? src_thread_addr_offset : 0xffffffff,
false,
false);
#else
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
float4_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x4(
src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#endif
return *reinterpret_cast<ushort8_t*>(&dst_out_tmp);
#endif
}
template <>
__device__ void amd_buffer_store<float, 1>(const float* p_src,
float* p_dst_block,
__device__ void amd_buffer_store<float, 1>(const float* p_src_thread,
float* p_dst_wave,
index_t dst_thread_data_offset,
index_t dst_const_data_offset)
bool dst_thread_data_valid,
index_t dst_data_range)
{
BufferAddressConfig<float> dst_block_config;
BufferResourceConstant<float> dst_wave_buffer_resource;
// fill in byte 0 - 1
dst_block_config.address[0] = p_dst_block;
// fill in byte 2
dst_block_config.range[2] = -1;
// fill in byte 3
dst_block_config.range[3] = 0x00027000;
// wavewise base address (64 bit)
dst_wave_buffer_resource.address[0] = p_dst_wave;
// wavewise range (32 bit)
dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
// wavewise setting (32 bit)
dst_wave_buffer_resource.config[3] = 0x00027000;
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
__llvm_amdgcn_buffer_store_f32(*p_src,
dst_block_config.data,
#if 1 // debug
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_store_f32(*p_src_thread,
dst_wave_buffer_resource.data,
0,
dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
false,
false);
#else
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32(*p_src_thread,
dst_wave_buffer_resource.data,
0,
dst_thread_addr_offset + dst_const_addr_offset,
dst_addr_shift + dst_thread_addr_offset,
false,
false);
#endif
#else
if(dst_thread_data_valid)
{
__llvm_amdgcn_buffer_store_f32(
*p_src_thread, dst_wave_buffer_resource.data, 0, dst_thread_addr_offset, false, false);
}
#endif
}
template <>
__device__ void amd_buffer_store<float, 2>(const float* p_src,
float* p_dst_block,
__device__ void amd_buffer_store<float, 2>(const float* p_src_thread,
float* p_dst_wave,
index_t dst_thread_data_offset,
index_t dst_const_data_offset)
bool dst_thread_data_valid,
index_t dst_data_range)
{
BufferAddressConfig<float> dst_block_config;
BufferResourceConstant<float> dst_wave_buffer_resource;
// fill in byte 0 - 1
dst_block_config.address[0] = p_dst_block;
// fill in byte 2
dst_block_config.range[2] = -1;
// fill in byte 3
dst_block_config.range[3] = 0x00027000;
// wavewise base address (64 bit)
dst_wave_buffer_resource.address[0] = p_dst_wave;
// wavewise range (32 bit)
dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
// wavewise setting (32 bit)
dst_wave_buffer_resource.config[3] = 0x00027000;
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
__llvm_amdgcn_buffer_store_f32x2(*reinterpret_cast<const float2_t*>(p_src),
dst_block_config.data,
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_store_f32x2(*reinterpret_cast<const float2_t*>(p_src_thread),
dst_wave_buffer_resource.data,
0,
dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
false,
false);
#else
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32x2(*reinterpret_cast<const float2_t*>(p_src_thread),
dst_wave_buffer_resource.data,
0,
dst_thread_addr_offset + dst_const_addr_offset,
dst_addr_shift + dst_thread_addr_offset,
false,
false);
#endif
}
template <>
__device__ void amd_buffer_store<float, 4>(const float* p_src,
float* p_dst_block,
__device__ void amd_buffer_store<float, 4>(const float* p_src_thread,
float* p_dst_wave,
index_t dst_thread_data_offset,
index_t dst_const_data_offset)
bool dst_thread_data_valid,
index_t dst_data_range)
{
BufferAddressConfig<float> dst_block_config;
BufferResourceConstant<float> dst_wave_buffer_resource;
// fill in byte 0 - 1
dst_block_config.address[0] = p_dst_block;
// fill in byte 2
dst_block_config.range[2] = -1;
// fill in byte 3
dst_block_config.range[3] = 0x00027000;
// wavewise base address (64 bit)
dst_wave_buffer_resource.address[0] = p_dst_wave;
// wavewise range (32 bit)
dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
// wavewise setting (32 bit)
dst_wave_buffer_resource.config[3] = 0x00027000;
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
__llvm_amdgcn_buffer_store_f32x4(*reinterpret_cast<const float4_t*>(p_src),
dst_block_config.data,
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_store_f32x4(*reinterpret_cast<const float4_t*>(p_src_thread),
dst_wave_buffer_resource.data,
0,
dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
false,
false);
#else
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32x4(*reinterpret_cast<const float4_t*>(p_src_thread),
dst_wave_buffer_resource.data,
0,
dst_thread_addr_offset + dst_const_addr_offset,
dst_addr_shift + dst_thread_addr_offset,
false,
false);
#endif
}
template <>
__device__ void amd_buffer_store<half_t, 1>(const half_t* p_src,
half_t* p_dst_block,
__device__ void amd_buffer_store<half_t, 1>(const half_t* p_src_thread,
half_t* p_dst_wave,
index_t dst_thread_data_offset,
index_t dst_const_data_offset)
bool dst_thread_data_valid,
index_t dst_data_range)
{
BufferAddressConfig<half_t> dst_block_config;
BufferResourceConstant<half_t> dst_wave_buffer_resource;
// fill in byte 0 - 1
dst_block_config.address[0] = p_dst_block;
// fill in byte 2
dst_block_config.range[2] = -1;
// fill in byte 3
dst_block_config.range[3] = 0x00027000;
// wavewise base address (64 bit)
dst_wave_buffer_resource.address[0] = p_dst_wave;
// wavewise range (32 bit)
dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(half_t);
// wavewise setting (32 bit)
dst_wave_buffer_resource.config[3] = 0x00027000;
#if !CK_WORKAROUND_SWDEV_231101
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t);
index_t dst_const_addr_offset = dst_const_data_offset * sizeof(half_t);
__llvm_amdgcn_buffer_store_f16(*p_src,
dst_block_config.data,
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_store_f16(*p_src_thread,
dst_wave_buffer_resource.data,
0,
dst_thread_addr_offset + dst_const_addr_offset,
dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
false,
false);
#else
p_dst_block[dst_thread_data_offset + dst_const_data_offset] = *p_src;
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f16(*p_src_thread,
dst_wave_buffer_resource.data,
0,
dst_addr_shift + dst_thread_addr_offset,
false,
false);
#endif
#else
if(dst_thread_data_valid)
{
p_dst_wave[dst_thread_data_offset] = *p_src_thread;
}
#endif
}
template <>
__device__ void amd_buffer_store<half_t, 2>(const half_t* p_src,
half_t* p_dst_block,
__device__ void amd_buffer_store<half_t, 2>(const half_t* p_src_thread,
half_t* p_dst_wave,
index_t dst_thread_data_offset,
index_t dst_const_data_offset)
bool dst_thread_data_valid,
index_t dst_data_range)
{
BufferAddressConfig<half_t> dst_block_config;
BufferResourceConstant<half_t> dst_wave_buffer_resource;
// fill in byte 0 - 1
dst_block_config.address[0] = p_dst_block;
// fill in byte 2
dst_block_config.range[2] = -1;
// fill in byte 3
dst_block_config.range[3] = 0x00027000;
// wavewise base address (64 bit)
dst_wave_buffer_resource.address[0] = p_dst_wave;
// wavewise range (32 bit)
dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(half_t);
// wavewise setting (32 bit)
dst_wave_buffer_resource.config[3] = 0x00027000;
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t);
index_t dst_const_addr_offset = dst_const_data_offset * sizeof(half_t);
#if !CK_WORKAROUND_SWDEV_231101
__llvm_amdgcn_buffer_store_f16x2(*reinterpret_cast<const half2_t*>(p_src),
dst_block_config.data,
const float* p_src_tmp = reinterpret_cast<const float*>(p_src_thread);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_store_f32(*p_src_tmp,
dst_wave_buffer_resource.data,
0,
dst_thread_addr_offset + dst_const_addr_offset,
dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
false,
false);
#else
const float* p_src_tmp = reinterpret_cast<const float*>(p_src);
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32(*p_src_tmp,
dst_block_config.data,
dst_wave_buffer_resource.data,
0,
dst_thread_addr_offset + dst_const_addr_offset,
dst_addr_shift + dst_thread_addr_offset,
false,
false);
#endif
}
template <>
__device__ void amd_buffer_store<half_t, 4>(const half_t* p_src,
half_t* p_dst_block,
__device__ void amd_buffer_store<half_t, 4>(const half_t* p_src_thread,
half_t* p_dst_wave,
index_t dst_thread_data_offset,
index_t dst_const_data_offset)
bool dst_thread_data_valid,
index_t dst_data_range)
{
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t);
index_t dst_const_addr_offset = dst_const_data_offset * sizeof(half_t);
BufferResourceConstant<half_t> dst_wave_buffer_resource;
BufferAddressConfig<half_t> dst_block_config;
// wavewise base address (64 bit)
dst_wave_buffer_resource.address[0] = p_dst_wave;
// wavewise range (32 bit)
dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(half_t);
// wavewise setting (32 bit)
dst_wave_buffer_resource.config[3] = 0x00027000;
// fill in byte 0 - 1
dst_block_config.address[0] = p_dst_block;
// fill in byte 2
dst_block_config.range[2] = -1;
// fill in byte 3
dst_block_config.range[3] = 0x00027000;
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t);
#if !CK_WORKAROUND_SWDEV_231101
__llvm_amdgcn_buffer_store_f16x4(*reinterpret_cast<const half4_t*>(p_src),
dst_block_config.data,
const float2_t* p_src_tmp = reinterpret_cast<const float2_t*>(p_src_thread);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_store_f32x2(*p_src_tmp,
dst_wave_buffer_resource.data,
0,
dst_thread_addr_offset + dst_const_addr_offset,
dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
false,
false);
#else
const float2_t* p_src_tmp = reinterpret_cast<const float2_t*>(p_src);
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32x2(*p_src_tmp,
dst_block_config.data,
dst_wave_buffer_resource.data,
0,
dst_addr_shift + dst_thread_addr_offset,
false,
false);
#endif
}
template <>
__device__ void amd_buffer_store<half_t, 8>(const half_t* p_src_thread,
half_t* p_dst_wave,
index_t dst_thread_data_offset,
bool dst_thread_data_valid,
index_t dst_data_range)
{
BufferResourceConstant<half_t> dst_wave_buffer_resource;
// wavewise base address (64 bit)
dst_wave_buffer_resource.address[0] = p_dst_wave;
// wavewise range (32 bit)
dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(half_t);
// wavewise setting (32 bit)
dst_wave_buffer_resource.config[3] = 0x00027000;
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t);
const float4_t* p_src_tmp = reinterpret_cast<const float4_t*>(p_src_thread);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_store_f32x4(*p_src_tmp,
dst_wave_buffer_resource.data,
0,
dst_thread_addr_offset + dst_const_addr_offset,
dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
false,
false);
#else
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32x4(*p_src_tmp,
dst_wave_buffer_resource.data,
0,
dst_addr_shift + dst_thread_addr_offset,
false,
false);
#endif
}
template <>
__device__ void amd_buffer_store<ushort, 1>(const ushort* p_src,
ushort* p_dst_block,
__device__ void amd_buffer_store<ushort, 1>(const ushort* p_src_thread,
ushort* p_dst_wave,
index_t dst_thread_data_offset,
index_t dst_const_data_offset)
bool dst_thread_data_valid,
index_t dst_data_range)
{
BufferAddressConfig<ushort> dst_block_config;
BufferResourceConstant<ushort> dst_wave_buffer_resource;
// fill in byte 0 - 1
dst_block_config.address[0] = p_dst_block;
// fill in byte 2
dst_block_config.range[2] = -1;
// fill in byte 3
dst_block_config.range[3] = 0x00027000;
// wavewise base address (64 bit)
dst_wave_buffer_resource.address[0] = p_dst_wave;
// wavewise range (32 bit)
dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(ushort);
// wavewise setting (32 bit)
dst_wave_buffer_resource.config[3] = 0x00027000;
#if !CK_WORKAROUND_SWDEV_231101
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(ushort);
index_t dst_const_addr_offset = dst_const_data_offset * sizeof(ushort);
__llvm_amdgcn_buffer_store_bf16(*p_src,
dst_block_config.data,
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_store_bf16(*p_src_thread,
dst_wave_buffer_resource.data,
0,
dst_thread_addr_offset + dst_const_addr_offset,
dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
false,
false);
#else
p_dst_block[dst_thread_data_offset + dst_const_data_offset] = *p_src;
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_bf16(*p_src_thread,
dst_wave_buffer_resource.data,
0,
dst_addr_shift + dst_thread_addr_offset,
false,
false);
#endif
#else
if(dst_thread_data_valid)
{
p_dst_wave[dst_thread_data_offset] = *p_src_thread;
}
#endif
}
template <>
__device__ void amd_buffer_store<ushort, 2>(const ushort* p_src,
ushort* p_dst_block,
__device__ void amd_buffer_store<ushort, 2>(const ushort* p_src_thread,
ushort* p_dst_wave,
index_t dst_thread_data_offset,
index_t dst_const_data_offset)
bool dst_thread_data_valid,
index_t dst_data_range)
{
BufferAddressConfig<ushort> dst_block_config;
BufferResourceConstant<ushort> dst_wave_buffer_resource;
// fill in byte 0 - 1
dst_block_config.address[0] = p_dst_block;
// fill in byte 2
dst_block_config.range[2] = -1;
// fill in byte 3
dst_block_config.range[3] = 0x00027000;
// wavewise base address (64 bit)
dst_wave_buffer_resource.address[0] = p_dst_wave;
// wavewise range (32 bit)
dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(ushort);
// wavewise setting (32 bit)
dst_wave_buffer_resource.config[3] = 0x00027000;
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(ushort);
index_t dst_const_addr_offset = dst_const_data_offset * sizeof(ushort);
#if !CK_WORKAROUND_SWDEV_231101
__llvm_amdgcn_buffer_store_bf16x2(*p_src,
dst_block_config.data,
const float* p_src_tmp = reinterpret_cast<const float*>(p_src_thread);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_store_f32(*p_src_tmp,
dst_wave_buffer_resource.data,
0,
dst_thread_addr_offset + dst_const_addr_offset,
dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
false,
false);
#else
const float* p_src_tmp = reinterpret_cast<const float*>(p_src);
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32(*p_src_tmp,
dst_block_config.data,
dst_wave_buffer_resource.data,
0,
dst_thread_addr_offset + dst_const_addr_offset,
dst_addr_shift + dst_thread_addr_offset,
false,
false);
#endif
}
template <>
__device__ void amd_buffer_store<ushort, 4>(const ushort* p_src,
ushort* p_dst_block,
__device__ void amd_buffer_store<ushort, 4>(const ushort* p_src_thread,
ushort* p_dst_wave,
index_t dst_thread_data_offset,
index_t dst_const_data_offset)
bool dst_thread_data_valid,
index_t dst_data_range)
{
BufferAddressConfig<ushort> dst_block_config;
BufferResourceConstant<ushort> dst_wave_buffer_resource;
// fill in byte 0 - 1
dst_block_config.address[0] = p_dst_block;
// fill in byte 2
dst_block_config.range[2] = -1;
// fill in byte 3
dst_block_config.range[3] = 0x00027000;
// wavewise base address (64 bit)
dst_wave_buffer_resource.address[0] = p_dst_wave;
// wavewise range (32 bit)
dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(ushort);
// wavewise setting (32 bit)
dst_wave_buffer_resource.config[3] = 0x00027000;
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(ushort);
index_t dst_const_addr_offset = dst_const_data_offset * sizeof(ushort);
#if !CK_WORKAROUND_SWDEV_231101
__llvm_amdgcn_buffer_store_bf16x4(*p_src,
dst_block_config.data,
const float2_t* p_src_tmp = reinterpret_cast<const float2_t*>(p_src_thread);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_store_f32x2(*p_src_tmp,
dst_wave_buffer_resource.data,
0,
dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
false,
false);
#else
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32x2(*p_src_tmp,
dst_wave_buffer_resource.data,
0,
dst_addr_shift + dst_thread_addr_offset,
false,
false);
#endif
}
template <>
__device__ void amd_buffer_store<ushort, 8>(const ushort* p_src_thread,
                                            ushort* p_dst_wave,
                                            index_t dst_thread_data_offset,
                                            bool dst_thread_data_valid,
                                            index_t dst_data_range)
{
BufferResourceConstant<ushort> dst_wave_buffer_resource;
// wavewise base address (64 bit)
dst_wave_buffer_resource.address[0] = p_dst_wave;
// wavewise range (32 bit)
dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(ushort);
// wavewise setting (32 bit)
dst_wave_buffer_resource.config[3] = 0x00027000;
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(ushort);
const float4_t* p_src_tmp = reinterpret_cast<const float4_t*>(p_src_thread);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_store_f32x4(*p_src_tmp,
dst_wave_buffer_resource.data,
0,
dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
false,
false);
#else
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32x4(*p_src_tmp,
dst_wave_buffer_resource.data,
0,
dst_addr_shift + dst_thread_addr_offset,
false,
false);
#endif
}
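// Explanatory note (not part of the original source): the 2-, 4- and 8-element
// ushort stores reinterpret the data as float, float2_t and float4_t and go
// through the f32 buffer-store intrinsics, presumably to sidestep the bf16
// intrinsic issue tracked by CK_WORKAROUND_SWDEV_231101; only the single-element
// path above still carries that guard explicitly.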
template <>
__device__ void amd_buffer_atomic_add<float, 1>(const float* p_src_thread,
float* p_dst_wave,
index_t dst_thread_data_offset,
bool dst_thread_data_valid,
index_t dst_data_range)
{
BufferResourceConstant<float> dst_wave_buffer_resource;
// wavewise base address (64 bit)
dst_wave_buffer_resource.address[0] = p_dst_wave;
// wavewise range (32 bit)
dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
// wavewise setting (32 bit)
dst_wave_buffer_resource.config[3] = 0x00027000;
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_atomic_add_f32(*p_src_thread,
dst_wave_buffer_resource.data,
0,
dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
false);
#else
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_atomic_add_f32(*p_src_thread,
dst_wave_buffer_resource.data,
0,
dst_addr_shift + dst_thread_addr_offset,
false);
#endif
}
template <>
__device__ void amd_buffer_atomic_add<float, 2>(const float* p_src_thread,
                                                float* p_dst_wave,
                                                index_t dst_thread_data_offset,
                                                bool dst_thread_data_valid,
                                                index_t dst_data_range)
{
BufferResourceConstant<float> dst_wave_buffer_resource;
// wavewise base address (64 bit)
dst_wave_buffer_resource.address[0] = p_dst_wave;
// wavewise range (32 bit)
dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
// wavewise setting (32 bit)
dst_wave_buffer_resource.config[3] = 0x00027000;
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
for(index_t i = 0; i < 2; ++i)
{
__llvm_amdgcn_buffer_atomic_add_f32(
p_src_thread[i],
dst_wave_buffer_resource.data,
0,
dst_thread_data_valid ? (dst_thread_addr_offset + i * sizeof(float)) : 0xffffffff,
false);
}
#else
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
for(index_t i = 0; i < 2; ++i)
{
__llvm_amdgcn_buffer_atomic_add_f32(p_src_thread[i],
dst_wave_buffer_resource.data,
0,
dst_addr_shift + dst_thread_addr_offset +
i * sizeof(float),
false);
}
#endif
}
template <>
__device__ void amd_buffer_atomic_add<float, 4>(const float* p_src_thread,
                                                float* p_dst_wave,
                                                index_t dst_thread_data_offset,
                                                bool dst_thread_data_valid,
                                                index_t dst_data_range)
{
BufferResourceConstant<float> dst_wave_buffer_resource;
// wavewise base address (64 bit)
dst_wave_buffer_resource.address[0] = p_dst_wave;
// wavewise range (32 bit)
dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
// wavewise setting (32 bit)
dst_wave_buffer_resource.config[3] = 0x00027000;
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
for(index_t i = 0; i < 4; ++i)
{
__llvm_amdgcn_buffer_atomic_add_f32(
p_src_thread[i],
dst_wave_buffer_resource.data,
0,
dst_thread_data_valid ? (dst_thread_addr_offset + i * sizeof(float)) : 0xffffffff,
false);
}
#else
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
for(index_t i = 0; i < 4; ++i)
{
__llvm_amdgcn_buffer_atomic_add_f32(p_src_thread[i],
dst_wave_buffer_resource.data,
0,
dst_addr_shift + dst_thread_addr_offset +
i * sizeof(float),
false);
}
#endif
}
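// Explanatory note (not part of the original source): the 2- and 4-element atomic
// adds are issued as loops of scalar __llvm_amdgcn_buffer_atomic_add_f32 calls,
// one per element, with the byte offset advanced by i * sizeof(float); each
// iteration applies the same valid-lane / offset-trick handling as the
// single-element version.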
} // namespace ck
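// Illustration only (not part of the original header): a minimal sketch of how a
// caller is expected to use the wavewise store interface, assuming the
// amd_buffer_store<float, 4> specialization defined earlier in this file; the
// helper name and its parameters are hypothetical.
namespace ck {

__device__ void example_wavewise_store(const float* p_thread_src,  // 4 floats held in VGPRs
                                       float* p_wave_dst,          // wave-invariant base pointer
                                       index_t thread_data_offset, // this thread's element offset
                                       index_t wave_data_range)    // elements addressable by the wave
{
    // The validity flag and the element range together drive the hardware
    // out-of-bound check: invalid lanes are either given a 0xffffffff offset or
    // shifted past the (at most 2 GB) buffer window, so nothing is written for them.
    const bool thread_data_valid = (thread_data_offset + 4) <= wave_data_range;

    amd_buffer_store<float, 4>(
        p_thread_src, p_wave_dst, thread_data_offset, thread_data_valid, wave_data_range);
}

} // namespace ck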
......
......@@ -49,12 +49,13 @@
#endif
// experimental implementation
#ifndef CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
#define CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK 1
#endif
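// Note added for clarity (not part of the original header): with the trick set to
// 0 the buffer intrinsics receive 0xffffffff as an invalid lane's offset; with 1
// (the default above) the lane's offset is shifted by 0x7fffffff so it falls past
// the at-most-2 GB buffer window. Either way the hardware out-of-bound check drops
// the access.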
#ifndef CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE
#define CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE 1
#define CK_EXPERIMENTAL_TENSOR_COORDINATE_USE_CALCULATE_OFFSET_DIFF 0
#define CK_EXPERIMENTAL_THREADWISE_COPY_V4R2_USE_OPTIMIZED_ADDRESS_CACLULATION 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
#endif
#ifndef CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK
#define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK 0
......
......@@ -47,38 +47,69 @@ struct SetData
// This version is only for compatibility, don't use this version if possible
template <AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace>
__device__ void Run(const T* p_src,
index_t src_offset,
bool src_valid,
index_t /* src_range */,
T* p_dst,
index_t dst_offset,
bool dst_valid,
index_t /* dst_range */) const
{
if(dst_valid)
{
if(src_valid)
{
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
*reinterpret_cast<const vector_t*>(&p_src[src_offset]);
}
else
{
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) = 0;
}
}
}
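// Note added for clarity (not part of the original source): in this compatibility
// path an invalid source lane zero-fills a valid destination rather than skipping
// it, mirroring the zero fall-back used by the buffer-addressing specializations
// below.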
#if CK_USE_AMD_BUFFER_ADDRESSING
// buffer_load requires:
// 1) p_src_thread must be in global memory space, p_dst_thread must be in vgpr
// 2) p_src_thread must be a wavewise pointer.
// It is the user's responsibility to make sure that is true.
template <>
__device__ void Run<AddressSpace::Global, AddressSpace::Vgpr>(const T* p_src,
index_t src_offset,
bool src_valid,
index_t src_range,
T* p_dst,
index_t dst_offset,
bool dst_valid,
index_t /* dst_range */) const
{
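// Note added for clarity (not part of the original source): dst_valid guards the
// write into the thread's own VGPR buffer, while src_valid and src_range are
// forwarded to amd_buffer_load so the hardware out-of-bound check (or the offset
// trick) decides whether anything is actually read from global memory.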
if(dst_valid)
{
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
amd_buffer_load<T, DataPerAccess>(p_src, src_offset, src_valid, src_range);
}
}
// buffer_store requires:
// 1) p_src_thread must be in vgpr space, p_dst_thread must be in global memory
// 2) p_dst_thread must be a wavewise pointer.
// It is the user's responsibility to make sure that is true.
template <>
__device__ void Run<AddressSpace::Vgpr, AddressSpace::Global>(const T* p_src,
index_t src_offset,
bool src_valid,
index_t /* src_range */,
T* p_dst,
index_t dst_offset,
bool dst_valid,
index_t dst_range) const
{
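// Note added for clarity (not part of the original source): when the source lane
// is invalid, a zero vector is stored instead, so Set semantics still produce a
// deterministic value; dst_valid and dst_range then decide, via the buffer
// out-of-bound check, whether that store reaches memory at all.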
const auto zeros = vector_t(0);

amd_buffer_store<T, DataPerAccess>(src_valid ? &(p_src[src_offset])
                                             : reinterpret_cast<const T*>(&zeros),
                                   p_dst,
                                   dst_offset,
                                   dst_valid,
                                   dst_range);
}
#endif
};
......@@ -90,24 +121,43 @@ struct AtomicAddData
// This version is only for compatibility, don't use this version if possible
template <AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace>
__device__ void Run(const T* p_src,
index_t src_offset,
bool src_valid,
index_t /* src_range */,
T* p_dst,
index_t dst_offset,
bool dst_valid,
index_t /* dst_range */) const
{
if(src_valid && dst_valid)
{
atomic_add_impl(reinterpret_cast<vector_t*>(&p_dst[dst_offset]),
*reinterpret_cast<const vector_t*>(&p_src[src_offset]));
}
}
#if CK_USE_AMD_BUFFER_ADDRESSING && CK_USE_AMD_BUFFER_ATOMIC_ADD
// buffer_atomic_add requires:
// 1) p_src_thread must be in vgpr space, p_dst_thread must be in global memory
// 2) p_dst_thread must be a wavewise pointer.
// It is the user's responsibility to make sure that is true.
template <>
__device__ void Run<AddressSpace::Vgpr, AddressSpace::Global>(const T* p_src,
index_t src_offset,
bool src_valid,
index_t /* src_range */,
T* p_dst,
index_t dst_offset,
bool dst_valid,
index_t dst_range) const
{
const auto zeros = vector_t(0);

amd_buffer_atomic_add<T, DataPerAccess>(src_valid ? &(p_src[src_offset])
                                                  : reinterpret_cast<const T*>(&zeros),
                                        p_dst,
                                        dst_offset,
                                        dst_valid,
                                        dst_range);
}
#endif
};
......@@ -119,7 +169,14 @@ template <typename T,
InMemoryDataOperation DstInMemOp,
index_t SrcDataStride = 1,
index_t DstDataStride = 1>
__device__ void transfer_data(const T* p_src,
index_t src_offset,
bool src_valid,
index_t src_range,
T* p_dst,
index_t dst_offset,
bool dst_valid,
index_t dst_range)
{
static_assert(DstInMemOp == InMemoryDataOperation::Set ||
DstInMemOp == InMemoryDataOperation::AtomicAdd,
......@@ -131,27 +188,41 @@ __device__ void transfer_data(const T* p_src, index_t src_offset, T* p_dst, inde
// TODO: use static_if::ElseIf
static_if<DstInMemOp == InMemoryDataOperation::Set>{}([&](auto) {
SetData<T, DataPerAccess>{}.template Run<SrcAddressSpace, DstAddressSpace>(
p_src, src_offset, src_valid, src_range, p_dst, dst_offset, dst_valid, dst_range);
});
static_if<DstInMemOp == InMemoryDataOperation::AtomicAdd>{}([&](auto) {
AtomicAddData<T, DataPerAccess>{}.template Run<SrcAddressSpace, DstAddressSpace>(
p_src, src_offset, src_valid, src_range, p_dst, dst_offset, dst_valid, dst_range);
});
}
else
{
for(index_t i = 0; i < DataPerAccess; ++i)
{
// TODO: use static_if::ElseIf
static_if<DstInMemOp == InMemoryDataOperation::Set>{}([&](auto) {
SetData<T, 1>{}.template Run<SrcAddressSpace, DstAddressSpace>(
p_src,
src_offset + i * SrcDataStride,
src_valid,
src_range,
p_dst,
dst_offset + i * DstDataStride,
dst_valid,
dst_range);
});
static_if<DstInMemOp == InMemoryDataOperation::AtomicAdd>{}([&](auto) {
AtomicAddData<T, 1>{}.template Run<SrcAddressSpace, DstAddressSpace>(
p_src,
src_offset + i * SrcDataStride,
src_valid,
src_range,
p_dst,
dst_offset + i * DstDataStride,
dst_valid,
dst_range);
});
}
}
......