Commit 079b745d authored by Chao Liu

refactor buffer load/store

parent f6ec737c
@@ -178,7 +178,6 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
dst_vector(i) = p_src[Number<src_offset>{}];
});
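// Note on the two branches below: the #if 1 path issues a single vectorized
// amd_buffer_store_v2 for the whole DstScalarPerVector-wide vector, while the
// disabled #else path is the scalar fallback that stores one element at a time.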
#if 1
amd_buffer_store_v2<DstData, DstScalarPerVector>(
dst_vector.Vector(),
p_dst,
@@ -186,17 +185,6 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
coordinate_has_valid_offset_assuming_visible_index_is_valid(
dst_desc, dst_slice_origin_coord_),
dst_desc.GetElementSpaceSize());
#else
static_for<0, DstScalarPerVector, 1>{}([&](auto i) {
amd_buffer_store_v2<DstData, 1>(
dst_vector[i],
p_dst,
dst_slice_origin_coord_.GetOffset() + i.value,
coordinate_has_valid_offset_assuming_visible_index_is_valid(
dst_desc, dst_slice_origin_coord_),
dst_desc.GetElementSpaceSize());
});
#endif
constexpr auto move_on_dim = [&]() constexpr
{
@@ -480,16 +468,14 @@ struct ThreadwiseDynamicTensorSliceTransfer_v2
if constexpr(SrcAddressSpace == AddressSpace::Global)
{
src_vector.Vector() = amd_buffer_load<SrcData, SrcScalarPerVector>(
p_src,
src_slice_origin_coord_.GetOffset(),
true,
src_desc.GetElementSpaceSize());
const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
src_desc, src_slice_origin_coord_);
src_vector.Vector() = is_valid ? src_vector.Vector() : src_vector_t{0};
src_vector.Vector() = amd_buffer_load_v2<SrcData, SrcScalarPerVector>(
p_src,
src_slice_origin_coord_.GetOffset(),
is_valid,
src_desc.GetElementSpaceSize());
}
else
{
@@ -816,30 +802,14 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
using src_vector_t = typename vector_type<SrcData, SrcScalarPerVector>::MemoryType;
#if 1
src_vector.Vector() = amd_buffer_load<SrcData, SrcScalarPerVector>(
p_src, src_slice_origin_coord_.GetOffset(), true, src_desc.GetElementSpaceSize());
const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
src_desc, src_slice_origin_coord_);
src_vector.Vector() = is_valid ? src_vector.Vector() : src_vector_t{0};
static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
constexpr index_t buffer_offset =
buffer_desc_.CalculateOffset(src_data_idx + i * src_scalar_step_in_vector);
buffer_(Number<buffer_offset>{}) = src_vector[i];
});
#else
const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
src_desc, src_slice_origin_coord_);
src_vector.Vector() =
amd_buffer_load<SrcData, SrcScalarPerVector>(p_src,
src_slice_origin_coord_.GetOffset(),
is_valid,
src_desc.GetElementSpaceSize());
amd_buffer_load_v2<SrcData, SrcScalarPerVector>(p_src,
src_slice_origin_coord_.GetOffset(),
is_valid,
src_desc.GetElementSpaceSize());
static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
constexpr index_t buffer_offset =
@@ -847,7 +817,6 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
buffer_(Number<buffer_offset>{}) = src_vector[i];
});
#endif
constexpr auto move_on_dim = [&]() constexpr
{
......
@@ -118,18 +118,6 @@ __device__ void amd_buffer_store(const T* p_src_thread,
bool dst_thread_data_valid,
index_t dst_data_range);
// buffer_store requires:
// 1) p_dst_wave must be in global memory space
// 2) p_dst_wave must be a wavewise pointer.
// It is the user's responsibility to make sure that is true.
template <typename T, index_t VectorSize>
__device__ void
amd_buffer_store_v2(const typename vector_type<T, VectorSize>::MemoryType src_thread_data,
T* p_dst_wave,
const index_t dst_thread_data_offset,
const bool dst_thread_data_valid,
const index_t dst_data_range);
// buffer_atomic requires:
// 1) p_src_thread must be in vgpr space, p_dst_thread must be in global memory space
// 2) p_dst_thread must be a wavewise pointer.
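// Illustrative sketch (not part of this commit), assuming amd_buffer_atomic_add
// mirrors amd_buffer_store's parameter list:
//
//   amd_buffer_atomic_add<float, 1>(
//       &acc, p_dst_wave, dst_offset, is_valid, dst_data_range);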
@@ -158,21 +146,16 @@ __device__ float amd_buffer_load<float, 1>(const float* p_src_wave,
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
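// Offset trick: for an invalid lane, adding 0x7fffffff pushes the voffset past
// the buffer's range (num_records), so the hardware returns 0 for the load
// instead of the code branching on validity.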
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
return __llvm_amdgcn_buffer_load_f32(
src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#else
#if 0 // debug
float tmp = __llvm_amdgcn_buffer_load_f32(
src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);
return src_thread_data_valid ? tmp : float(0);
#else
return __llvm_amdgcn_buffer_load_f32(
src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);
#endif
#endif
}
@@ -193,21 +176,16 @@ __device__ float2_t amd_buffer_load<float, 2>(const float* p_src_wave,
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
return __llvm_amdgcn_buffer_load_f32x2(
src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#else
#if 0
float2_t tmp = __llvm_amdgcn_buffer_load_f32x2(
src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);
return src_thread_data_valid ? tmp : float2_t(0);
#else
return __llvm_amdgcn_buffer_load_f32x2(
src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);
#endif
#endif
}
@@ -228,21 +206,16 @@ __device__ float4_t amd_buffer_load<float, 4>(const float* p_src_wave,
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
return __llvm_amdgcn_buffer_load_f32x4(
src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#else
#if 0
float4_t tmp = __llvm_amdgcn_buffer_load_f32x4(
src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);
return src_thread_data_valid ? tmp : float4_t(0);
#else
return __llvm_amdgcn_buffer_load_f32x4(
src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);
#endif
#endif
}
@@ -263,7 +236,7 @@ __device__ half_t amd_buffer_load<half_t, 1>(const half_t* p_src_wave,
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
// current code cannot isolate Soffset and Voffset, so Soffset is hard-coded to 0, and
@@ -298,7 +271,7 @@ __device__ half2_t amd_buffer_load<half_t, 2>(const half_t* p_src_wave,
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
float dst_out_tmp = __llvm_amdgcn_buffer_load_f32(
@@ -332,7 +305,7 @@ __device__ half4_t amd_buffer_load<half_t, 4>(const half_t* p_src_wave,
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
float2_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x2(
@@ -366,7 +339,7 @@ __device__ half8_t amd_buffer_load<half_t, 8>(const half_t* p_src_wave,
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
float4_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x4(
@@ -400,7 +373,7 @@ __device__ ushort amd_buffer_load<ushort, 1>(const ushort* p_src_wave,
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
// current code cannot isolate Soffset and Voffset, so Soffset is hard-coded to 0, and
@@ -435,7 +408,7 @@ __device__ ushort2_t amd_buffer_load<ushort, 2>(const ushort* p_src_wave,
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
float dst_out_tmp = __llvm_amdgcn_buffer_load_f32(
@@ -469,7 +442,7 @@ __device__ ushort4_t amd_buffer_load<ushort, 4>(const ushort* p_src_wave,
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
float2_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x2(
@@ -503,7 +476,7 @@ __device__ ushort8_t amd_buffer_load<ushort, 8>(const ushort* p_src_wave,
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
float4_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x4(
@@ -538,7 +511,7 @@ __device__ void amd_buffer_store<float, 1>(const float* p_src_thread,
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32(*p_src_thread,
@@ -574,7 +547,7 @@ __device__ void amd_buffer_store<float, 2>(const float* p_src_thread,
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32x2(*reinterpret_cast<const float2_t*>(p_src_thread),
@@ -614,7 +587,7 @@ __device__ void amd_buffer_store<float, 4>(const float* p_src_thread,
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32x4(*reinterpret_cast<const float4_t*>(p_src_thread),
@@ -654,7 +627,7 @@ __device__ void amd_buffer_store<half_t, 1>(const half_t* p_src_thread,
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t);
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
// current code cannot isolate Soffset and Voffset, so Soffset is hard-coded to 0, and
@@ -695,7 +668,7 @@ __device__ void amd_buffer_store<half_t, 2>(const half_t* p_src_thread,
const float* p_src_tmp = reinterpret_cast<const float*>(p_src_thread);
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32(*p_src_tmp,
@@ -733,7 +706,7 @@ __device__ void amd_buffer_store<half_t, 4>(const half_t* p_src_thread,
const float2_t* p_src_tmp = reinterpret_cast<const float2_t*>(p_src_thread);
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32x2(*p_src_tmp,
@@ -771,7 +744,7 @@ __device__ void amd_buffer_store<half_t, 8>(const half_t* p_src_thread,
const float4_t* p_src_tmp = reinterpret_cast<const float4_t*>(p_src_thread);
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32x4(*p_src_tmp,
@@ -807,7 +780,7 @@ __device__ void amd_buffer_store<ushort, 1>(const ushort* p_src_thread,
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(ushort);
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_raw_buffer_store_bf16(*p_src_thread,
@@ -844,7 +817,7 @@ __device__ void amd_buffer_store<ushort, 2>(const ushort* p_src_thread,
const float* p_src_tmp = reinterpret_cast<const float*>(p_src_thread);
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32(*p_src_tmp,
@@ -882,7 +855,7 @@ __device__ void amd_buffer_store<ushort, 4>(const ushort* p_src_thread,
const float2_t* p_src_tmp = reinterpret_cast<const float2_t*>(p_src_thread);
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32x2(*p_src_tmp,
@@ -920,7 +893,7 @@ __device__ void amd_buffer_store<ushort, 8>(const ushort* p_src_thread,
const float4_t* p_src_tmp = reinterpret_cast<const float4_t*>(p_src_thread);
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32x4(*p_src_tmp,
@@ -938,126 +911,6 @@ __device__ void amd_buffer_store<ushort, 8>(const ushort* p_src_thread,
#endif
}
template <>
__device__ void amd_buffer_store_v2<float, 1>(const float src_thread_data,
float* p_dst_wave,
const index_t dst_thread_data_offset,
const bool dst_thread_data_valid,
const index_t dst_data_range)
{
BufferResourceConstant<float> dst_wave_buffer_resource;
// wavewise base address (64 bit)
dst_wave_buffer_resource.address[0] = p_dst_wave;
// wavewise range (32 bit)
dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
// wavewise setting (32 bit)
dst_wave_buffer_resource.config[3] = 0x00027000;
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
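// Same offset trick as the loads: an invalid lane's voffset is pushed out of
// the buffer's range, so the hardware silently drops the out-of-bounds store.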
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32(src_thread_data,
dst_wave_buffer_resource.data,
0,
dst_addr_shift + dst_thread_addr_offset,
false,
false);
#else
if(dst_thread_data_valid)
{
__llvm_amdgcn_buffer_store_f32(src_thread_data,
dst_wave_buffer_resource.data,
0,
dst_thread_addr_offset,
false,
false);
}
#endif
}
template <>
__device__ void amd_buffer_store_v2<float, 2>(const float2_t src_thread_data,
float* p_dst_wave,
const index_t dst_thread_data_offset,
const bool dst_thread_data_valid,
const index_t dst_data_range)
{
BufferResourceConstant<float> dst_wave_buffer_resource;
// wavewise base address (64 bit)
dst_wave_buffer_resource.address[0] = p_dst_wave;
// wavewise range (32 bit)
dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
// wavewise setting (32 bit)
dst_wave_buffer_resource.config[3] = 0x00027000;
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32x2(src_thread_data,
dst_wave_buffer_resource.data,
0,
dst_addr_shift + dst_thread_addr_offset,
false,
false);
#else
if(dst_thread_data_valid)
{
__llvm_amdgcn_buffer_store_f32x2(src_thread_data,
dst_wave_buffer_resource.data,
0,
dst_thread_addr_offset,
false,
false);
}
#endif
}
template <>
__device__ void amd_buffer_store_v2<float, 4>(const float4_t src_thread_data,
float* p_dst_wave,
const index_t dst_thread_data_offset,
const bool dst_thread_data_valid,
const index_t dst_data_range)
{
BufferResourceConstant<float> dst_wave_buffer_resource;
// wavewise base address (64 bit)
dst_wave_buffer_resource.address[0] = p_dst_wave;
// wavewise range (32 bit)
dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
// wavewise setting (32 bit)
dst_wave_buffer_resource.config[3] = 0x00027000;
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32x4(src_thread_data,
dst_wave_buffer_resource.data,
0,
dst_addr_shift + dst_thread_addr_offset,
false,
false);
#else
if(dst_thread_data_valid)
{
__llvm_amdgcn_buffer_store_f32x4(src_thread_data,
dst_wave_buffer_resource.data,
0,
dst_thread_addr_offset,
false,
false);
}
#endif
}
#if CK_USE_AMD_BUFFER_ATOMIC_FADD
template <>
__device__ void amd_buffer_atomic_add<float, 1>(const float* p_src_thread,
@@ -1077,7 +930,7 @@ __device__ void amd_buffer_atomic_add<float, 1>(const float* p_src_thread,
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
#if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK_OFFSET_TRICK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_atomic_add_f32(*p_src_thread,
@@ -1112,7 +965,7 @@ __device__ void amd_buffer_atomic_add<float, 2>(const float* p_src_thread,
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
#if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK_OFFSET_TRICK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
for(index_t i = 0; i < 2; ++i)
@@ -1157,7 +1010,7 @@ __device__ void amd_buffer_atomic_add<float, 4>(const float* p_src_thread,
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
#if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK_OFFSET_TRICK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
for(index_t i = 0; i < 4; ++i)
......
#ifndef CK_AMD_BUFFER_ADDRESSING_V2_HPP
#define CK_AMD_BUFFER_ADDRESSING_V2_HPP
#include "float_type.hpp"
#include "amd_buffer_addressing.hpp"
namespace ck {
#if 0
// 128-bit resource constant, supplied in SGPRs to buffer instructions
// https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions
template <typename T>
union BufferResourceConstant
{
int32x4_t data;
T* address[2];
int32_t range[4];
int32_t config[4];
};
#endif
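// The union above is disabled here, presumably because amd_buffer_addressing.hpp
// (included at the top) already defines it; it builds the 128-bit resource
// constant field by field (address/range/config) and hands the same bits to the
// buffer intrinsics as a single int32x4_t (data).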
__device__ float __llvm_amdgcn_buffer_load_f32(int32x4_t srsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.load.f32");
__device__ float2_t
__llvm_amdgcn_buffer_load_f32x2(int32x4_t srsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.load.v2f32");
__device__ float4_t
__llvm_amdgcn_buffer_load_f32x4(int32x4_t srsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.load.v4f32");
// buffer_load requires:
// 1) p_src_wave must be in global memory space
// 2) p_src_wave must be a wavewise pointer.
// It is the user's responsibility to make sure that is true.
template <typename T, index_t VectorSize>
__device__ typename vector_type<T, VectorSize>::MemoryType
amd_buffer_load_v2(const T* p_src_wave,
index_t src_thread_data_offset,
bool src_thread_data_valid,
index_t src_element_space);
// buffer_store requires:
// 1) p_dst_wave must be in global memory space
// 2) p_dst_wave must be a wavewise pointer.
// It is the user's responsibility to make sure that is true.
template <typename T, index_t VectorSize>
__device__ void
amd_buffer_store_v2(const typename vector_type<T, VectorSize>::MemoryType src_thread_data,
T* p_dst_wave,
const index_t dst_thread_data_offset,
const bool dst_thread_data_valid,
const index_t dst_data_range);
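// Illustrative sketch (not part of this commit): a minimal round trip through
// the v2 wrappers. p_src/p_dst are assumed to be wavewise global-memory
// pointers, and offset/is_valid/size to come from the caller's coordinate
// bookkeeping:
//
//   float2_t v = amd_buffer_load_v2<float, 2>(p_src, offset, is_valid, size);
//   amd_buffer_store_v2<float, 2>(v, p_dst, offset, is_valid, size);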
template <>
__device__ float amd_buffer_load_v2<float, 1>(const float* p_src_wave,
index_t src_thread_data_offset,
bool src_thread_data_valid,
index_t src_data_range)
{
BufferResourceConstant<float> src_wave_buffer_resource;
// wavewise base address (64 bit)
src_wave_buffer_resource.address[0] = const_cast<float*>(p_src_wave);
// wavewise range (32 bit)
src_wave_buffer_resource.range[2] = src_data_range * sizeof(float);
// wavewise setting (32 bit)
src_wave_buffer_resource.config[3] = 0x00027000;
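// 0x00027000 presumably encodes SQ_BUF_RSRC_WORD3 with NUM_FORMAT=FLOAT and
// DATA_FORMAT=32 on GCN; this decoding is an assumption, not something stated
// by this commit.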
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
return __llvm_amdgcn_buffer_load_f32(
src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#else
float tmp = __llvm_amdgcn_buffer_load_f32(
src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);
return src_thread_data_valid ? tmp : float(0);
#endif
}
template <>
__device__ float2_t amd_buffer_load_v2<float, 2>(const float* p_src_wave,
index_t src_thread_data_offset,
bool src_thread_data_valid,
index_t src_data_range)
{
BufferResourceConstant<float> src_wave_buffer_resource;
// wavewise base address (64 bit)
src_wave_buffer_resource.address[0] = const_cast<float*>(p_src_wave);
// wavewise range (32 bit)
src_wave_buffer_resource.range[2] = src_data_range * sizeof(float);
// wavewise setting (32 bit)
src_wave_buffer_resource.config[3] = 0x00027000;
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
return __llvm_amdgcn_buffer_load_f32x2(
src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#else
float2_t tmp = __llvm_amdgcn_buffer_load_f32x2(
src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);
return src_thread_data_valid ? tmp : float2_t(0);
#endif
}
template <>
__device__ float4_t amd_buffer_load_v2<float, 4>(const float* p_src_wave,
index_t src_thread_data_offset,
bool src_thread_data_valid,
index_t src_data_range)
{
BufferResourceConstant<float> src_wave_buffer_resource;
// wavewise base address (64 bit)
src_wave_buffer_resource.address[0] = const_cast<float*>(p_src_wave);
// wavewise range (32 bit)
src_wave_buffer_resource.range[2] = src_data_range * sizeof(float);
// wavewise setting (32 bit)
src_wave_buffer_resource.config[3] = 0x00027000;
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
return __llvm_amdgcn_buffer_load_f32x4(
src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#else
float4_t tmp = __llvm_amdgcn_buffer_load_f32x4(
src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);
return src_thread_data_valid ? tmp : float4_t(0);
#endif
}
template <>
__device__ void amd_buffer_store_v2<float, 1>(const float src_thread_data,
float* p_dst_wave,
const index_t dst_thread_data_offset,
const bool dst_thread_data_valid,
const index_t dst_data_range)
{
BufferResourceConstant<float> dst_wave_buffer_resource;
// wavewise base address (64 bit)
dst_wave_buffer_resource.address[0] = p_dst_wave;
// wavewise range (32 bit)
dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
// wavewise setting (32 bit)
dst_wave_buffer_resource.config[3] = 0x00027000;
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32(src_thread_data,
dst_wave_buffer_resource.data,
0,
dst_addr_shift + dst_thread_addr_offset,
false,
false);
#else
if(dst_thread_data_valid)
{
__llvm_amdgcn_buffer_store_f32(src_thread_data,
dst_wave_buffer_resource.data,
0,
dst_thread_addr_offset,
false,
false);
}
#endif
}
template <>
__device__ void amd_buffer_store_v2<float, 2>(const float2_t src_thread_data,
float* p_dst_wave,
const index_t dst_thread_data_offset,
const bool dst_thread_data_valid,
const index_t dst_data_range)
{
BufferResourceConstant<float> dst_wave_buffer_resource;
// wavewise base address (64 bit)
dst_wave_buffer_resource.address[0] = p_dst_wave;
// wavewise range (32 bit)
dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
// wavewise setting (32 bit)
dst_wave_buffer_resource.config[3] = 0x00027000;
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32x2(src_thread_data,
dst_wave_buffer_resource.data,
0,
dst_addr_shift + dst_thread_addr_offset,
false,
false);
#else
if(dst_thread_data_valid)
{
__llvm_amdgcn_buffer_store_f32x2(src_thread_data,
dst_wave_buffer_resource.data,
0,
dst_thread_addr_offset,
false,
false);
}
#endif
}
template <>
__device__ void amd_buffer_store_v2<float, 4>(const float4_t src_thread_data,
float* p_dst_wave,
const index_t dst_thread_data_offset,
const bool dst_thread_data_valid,
const index_t dst_data_range)
{
BufferResourceConstant<float> dst_wave_buffer_resource;
// wavewise base address (64 bit)
dst_wave_buffer_resource.address[0] = p_dst_wave;
// wavewise range (32 bit)
dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
// wavewise setting (32 bit)
dst_wave_buffer_resource.config[3] = 0x00027000;
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32x4(src_thread_data,
dst_wave_buffer_resource.data,
0,
dst_addr_shift + dst_thread_addr_offset,
false,
false);
#else
if(dst_thread_data_valid)
{
__llvm_amdgcn_buffer_store_f32x4(src_thread_data,
dst_wave_buffer_resource.data,
0,
dst_thread_addr_offset,
false,
false);
}
#endif
}
} // namespace ck
#endif
@@ -58,16 +58,16 @@
#endif
// experimental implementation
#ifndef CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK 0
#ifndef CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 0
#endif
#ifndef CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
#define CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK 1
#ifndef CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
#define CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK 1
#endif
#ifndef CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK
#define CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK 1
#ifndef CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK_OFFSET_TRICK
#define CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK_OFFSET_TRICK 1
#endif
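// Note: with a *_OFFSET_TRICK macro enabled, the corresponding out-of-bounds
// check is done by shifting an invalid lane's voffset past the buffer range
// (the 0x7fffffff trick) rather than by a branch or select on validity.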
#ifndef CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE
......
@@ -5,6 +5,7 @@
#if CK_USE_AMD_BUFFER_ADDRESSING
#include "amd_buffer_addressing.hpp"
#include "amd_buffer_addressing_v2.hpp"
#endif
namespace ck {
......