"vscode:/vscode.git/clone" did not exist on "5ba9ea8698c8c2847f1366139d46edfbb39d743b"
Commit 67204577 authored by illsilin

fix clang format

parent 7567a2bd
@@ -34,10 +34,10 @@ CK_TILE_DEVICE int32x4_t make_wave_buffer_resource(const void* ptr, uint32_t siz
return r;
}
CK_TILE_DEVICE __amdgpu_buffer_rsrc_t make_wave_buffer_resource_new(const void* ptr, uint32_t size = 0xffffffff)
CK_TILE_DEVICE __amdgpu_buffer_rsrc_t make_wave_buffer_resource_new(const void* ptr,
uint32_t size = 0xffffffff)
{
auto p = const_cast<remove_cv_t<void>*>(ptr);
auto p = const_cast<remove_cv_t<void>*>(ptr);
return __builtin_amdgcn_make_buffer_rsrc(p, 0, size, CK_TILE_BUFFER_RESOURCE_3RD_DWORD);
}
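As context for the reformatted signature above: the resource is just a base pointer plus a byte extent, so a thin, hypothetical convenience wrapper (the name make_bounded_resource and its element-count parameter are illustrative assumptions, not part of this header) can bound it to a known array length:

// Hypothetical helper, assuming the ck_tile definitions above are in scope:
// build a buffer resource covering exactly n elements of T. A finite byte
// size is what the descriptor uses for out-of-bounds checking, instead of
// the "unbounded" default of 0xffffffff.
template <typename T>
CK_TILE_DEVICE __amdgpu_buffer_rsrc_t make_bounded_resource(const T* p, uint32_t n)
{
    return make_wave_buffer_resource_new(p, n * static_cast<uint32_t>(sizeof(T)));
}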
@@ -906,12 +906,12 @@ CK_TILE_DEVICE_EXTERN float llvm_amdgcn_raw_buffer_atomic_add_fp32(
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32.v4i32");
// buffer atomic-max fp64
CK_TILE_DEVICE_EXTERN double
llvm_amdgcn_raw_buffer_atomic_max_fp64(double vdata,
int32x4_t rsrc, // dst_wave_buffer_resource
int voffset, // dst_thread_addr_offset
int soffset, // dst_wave_addr_offset
int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64.v4i32");
CK_TILE_DEVICE_EXTERN double llvm_amdgcn_raw_buffer_atomic_max_fp64(
double vdata,
int32x4_t rsrc, // dst_wave_buffer_resource
int voffset, // dst_thread_addr_offset
int soffset, // dst_wave_addr_offset
int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64.v4i32");
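For reference, a hypothetical call site for the declaration just reformatted above (the wrapper name and the zero soffset/cache flags are illustrative, not from this commit) passes the value first and the resource second:

// Hypothetical call site: atomically fold v into the double at byte offset
// voffset inside dst_rsrc with a floating-point max; soffset and glc_slc are
// left at 0 and the old value returned by the intrinsic is discarded.
CK_TILE_DEVICE void buffer_atomic_max_f64(int32x4_t dst_rsrc, index_t voffset, double v)
{
    (void)llvm_amdgcn_raw_buffer_atomic_max_fp64(v, dst_rsrc, voffset, 0, 0);
}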
// Direct loads from global to LDS.
CK_TILE_DEVICE_EXTERN void
@@ -977,58 +977,59 @@ amd_buffer_load_impl_with_bytes(__amdgpu_buffer_rsrc_t src_wave_buffer_resource,
if constexpr(N == 1)
{
return bit_cast<rtn_type>(__builtin_amdgcn_raw_buffer_load_b8(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence)));
return bit_cast<rtn_type>(
__builtin_amdgcn_raw_buffer_load_b8(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence)));
}
else if constexpr(N == 2)
{
int16_t tmp = __builtin_amdgcn_raw_buffer_load_b16(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence));
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence));
return bit_cast<rtn_type>(tmp);
}
else if constexpr(N == 4)
{
int32_t tmp = __builtin_amdgcn_raw_buffer_load_b32(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence));
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence));
return bit_cast<rtn_type>(tmp);
}
else if constexpr(N == 8)
{
int32x2_t tmp = __builtin_amdgcn_raw_buffer_load_b64(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence));
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence));
return bit_cast<rtn_type>(tmp);
}
else if constexpr(N == 16)
{
int32x4_t tmp = __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence));
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence));
return bit_cast<rtn_type>(tmp);
}
else if constexpr(N == 32)
{
int32x4_t tmp0 = __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence));
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence));
int32x4_t tmp1 =
__builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset + 4 * sizeof(int32_t),
static_cast<index_t>(coherence));
src_thread_addr_offset,
src_wave_addr_offset + 4 * sizeof(int32_t),
static_cast<index_t>(coherence));
thread_buffer<int32_t, 8> tmp;
tmp.template get_as<int32x4_t>()(number<0>{}) = tmp0;
@@ -1039,24 +1040,24 @@ amd_buffer_load_impl_with_bytes(__amdgpu_buffer_rsrc_t src_wave_buffer_resource,
else if constexpr(N == 64)
{
int32x4_t tmp0 = __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence));
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence));
int32x4_t tmp1 =
__builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset + 4 * sizeof(int32_t),
static_cast<index_t>(coherence));
src_thread_addr_offset,
src_wave_addr_offset + 4 * sizeof(int32_t),
static_cast<index_t>(coherence));
int32x4_t tmp2 =
__builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset + 8 * sizeof(int32_t),
static_cast<index_t>(coherence));
src_thread_addr_offset,
src_wave_addr_offset + 8 * sizeof(int32_t),
static_cast<index_t>(coherence));
int32x4_t tmp3 =
__builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset + 12 * sizeof(int32_t),
static_cast<index_t>(coherence));
src_thread_addr_offset,
src_wave_addr_offset + 12 * sizeof(int32_t),
static_cast<index_t>(coherence));
thread_buffer<int32_t, 16> tmp;
@@ -1076,9 +1077,10 @@ amd_buffer_load_impl_with_bytes(__amdgpu_buffer_rsrc_t src_wave_buffer_resource,
template <typename T,
index_t N,
amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(__amdgpu_buffer_rsrc_t src_wave_buffer_resource,
index_t src_thread_addr_offset,
index_t src_wave_addr_offset)
CK_TILE_DEVICE thread_buffer<T, N>
amd_buffer_load_impl(__amdgpu_buffer_rsrc_t src_wave_buffer_resource,
index_t src_thread_addr_offset,
index_t src_wave_addr_offset)
{
static_assert(
(std::is_same<T, double>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
@@ -1100,25 +1102,25 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(__amdgpu_buffer_rsrc_t s
{
return bit_cast<rtn_type>(
__builtin_amdgcn_raw_buffer_load_b32(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence)));
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence)));
}
else if constexpr(N == 2)
{
return bit_cast<rtn_type>(
__builtin_amdgcn_raw_buffer_load_b64(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence)));
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence)));
}
else if constexpr(N == 4)
{
return bit_cast<rtn_type>(
__builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence)));
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence)));
}
else if constexpr(N == 8)
{
@@ -1126,15 +1128,15 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(__amdgpu_buffer_rsrc_t s
tmp.template get_as<fp32x4_t>()(number<0>{}) =
__builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence));
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence));
tmp.template get_as<fp32x4_t>()(number<1>{}) =
__builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset + 4 * sizeof(float),
static_cast<index_t>(coherence));
src_thread_addr_offset,
src_wave_addr_offset + 4 * sizeof(float),
static_cast<index_t>(coherence));
return tmp;
}
@@ -1144,27 +1146,27 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(__amdgpu_buffer_rsrc_t s
tmp.template get_as<fp32x4_t>()(number<0>{}) =
__builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence));
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence));
tmp.template get_as<fp32x4_t>()(number<1>{}) =
__builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset + 4 * sizeof(float),
static_cast<index_t>(coherence));
src_thread_addr_offset,
src_wave_addr_offset + 4 * sizeof(float),
static_cast<index_t>(coherence));
tmp.template get_as<fp32x4_t>()(number<2>{}) =
__builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset + 8 * sizeof(float),
static_cast<index_t>(coherence));
src_thread_addr_offset,
src_wave_addr_offset + 8 * sizeof(float),
static_cast<index_t>(coherence));
tmp.template get_as<fp32x4_t>()(number<3>{}) =
__builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset + 12 * sizeof(float),
static_cast<index_t>(coherence));
src_thread_addr_offset,
src_wave_addr_offset + 12 * sizeof(float),
static_cast<index_t>(coherence));
return tmp;
}
@@ -1175,33 +1177,33 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(__amdgpu_buffer_rsrc_t s
{
return bit_cast<rtn_type>(
__builtin_amdgcn_raw_buffer_load_b16(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence)));
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence)));
}
else if constexpr(N == 2)
{
return bit_cast<rtn_type>(
__builtin_amdgcn_raw_buffer_load_b32(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence)));
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence)));
}
else if constexpr(N == 4)
{
return bit_cast<rtn_type>(
__builtin_amdgcn_raw_buffer_load_b64(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence)));
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence)));
}
else if constexpr(N == 8)
{
// use fp32 load to mimic fp16 load
fp32x4_t tmp = __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence));
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence));
return bit_cast<rtn_type>(tmp);
}
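The N == 8 half-precision branch above works because eight halves and four floats occupy the same 16 bytes; a standalone sketch of that reinterpretation (local vector typedefs via Clang's ext_vector_type, independent of this header) is:

// Standalone illustration of "use fp32 load to mimic fp16 load": a single
// 128-bit load pulls in 16 bytes, and a bit_cast reinterprets them as eight
// halves without changing any bits.
typedef float    fp32x4_illus_t __attribute__((ext_vector_type(4)));
typedef _Float16 fp16x8_illus_t __attribute__((ext_vector_type(8)));

CK_TILE_DEVICE fp16x8_illus_t load_eight_halves(const fp32x4_illus_t* p)
{
    fp32x4_illus_t raw = *p;                         // one 16-byte load
    return __builtin_bit_cast(fp16x8_illus_t, raw);  // same bits, fp16 view
}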
@@ -1212,32 +1214,32 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(__amdgpu_buffer_rsrc_t s
{
return bit_cast<rtn_type>(
__builtin_amdgcn_raw_buffer_load_b16(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence)));
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence)));
}
else if constexpr(N == 2)
{
return bit_cast<rtn_type>(
__builtin_amdgcn_raw_buffer_load_b32(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence)));
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence)));
}
else if constexpr(N == 4)
{
return bit_cast<rtn_type>(
__builtin_amdgcn_raw_buffer_load_b64(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence)));
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence)));
}
else if constexpr(N == 8)
{
int32x4_t tmp = __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence));
src_thread_addr_offset,
src_wave_addr_offset,
static_cast<index_t>(coherence));
return bit_cast<rtn_type>(tmp);
}
@@ -1352,10 +1354,11 @@ CK_TILE_DEVICE void amd_async_buffer_load(CK_TILE_LDS_ADDR T* smem,
template <index_t N,
amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
CK_TILE_DEVICE void amd_buffer_store_impl_with_bytes(const thread_buffer<int8_t, N> src_thread_data,
__amdgpu_buffer_rsrc_t dst_wave_buffer_resource,
index_t dst_thread_addr_offset,
index_t dst_wave_addr_offset)
CK_TILE_DEVICE void
amd_buffer_store_impl_with_bytes(const thread_buffer<int8_t, N> src_thread_data,
__amdgpu_buffer_rsrc_t dst_wave_buffer_resource,
index_t dst_thread_addr_offset,
index_t dst_wave_addr_offset)
{
static_assert(N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32 || N == 64,
"wrong! not implemented");
@@ -1363,43 +1366,43 @@ CK_TILE_DEVICE void amd_buffer_store_impl_with_bytes(const thread_buffer<int8_t,
if constexpr(N == 1)
{
__builtin_amdgcn_raw_buffer_store_b8(bit_cast<int8_t>(src_thread_data),
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
static_cast<index_t>(coherence));
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
static_cast<index_t>(coherence));
}
else if constexpr(N == 2)
{
__builtin_amdgcn_raw_buffer_store_b16(bit_cast<int16_t>(src_thread_data),
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
static_cast<index_t>(coherence));
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
static_cast<index_t>(coherence));
}
else if constexpr(N == 4)
{
__builtin_amdgcn_raw_buffer_store_b32(bit_cast<int32_t>(src_thread_data),
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
static_cast<index_t>(coherence));
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
static_cast<index_t>(coherence));
}
else if constexpr(N == 8)
{
__builtin_amdgcn_raw_buffer_store_b64(bit_cast<int32x2_t>(src_thread_data),
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
static_cast<index_t>(coherence));
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
static_cast<index_t>(coherence));
}
else if constexpr(N == 16)
{
__builtin_amdgcn_raw_buffer_store_b128(bit_cast<int32x4_t>(src_thread_data),
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
static_cast<index_t>(coherence));
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
static_cast<index_t>(coherence));
}
else if constexpr(N == 32)
{
@@ -1477,26 +1480,26 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
if constexpr(N == 1)
{
__builtin_amdgcn_raw_buffer_store_b32(bit_cast<float>(src_thread_data),
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
static_cast<index_t>(coherence));
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
static_cast<index_t>(coherence));
}
else if constexpr(N == 2)
{
__builtin_amdgcn_raw_buffer_store_b64(bit_cast<fp32x2_t>(src_thread_data),
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
static_cast<index_t>(coherence));
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
static_cast<index_t>(coherence));
}
else if constexpr(N == 4)
{
__builtin_amdgcn_raw_buffer_store_b128(bit_cast<fp32x4_t>(src_thread_data),
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
static_cast<index_t>(coherence));
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
static_cast<index_t>(coherence));
}
else if constexpr(N == 8)
{
@@ -1519,26 +1522,26 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
if constexpr(N == 1)
{
__builtin_amdgcn_raw_buffer_store_b16(bit_cast<_Float16>(src_thread_data),
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
static_cast<index_t>(coherence));
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
static_cast<index_t>(coherence));
}
else if constexpr(N == 2)
{
__builtin_amdgcn_raw_buffer_store_b32(bit_cast<fp16x2_t>(src_thread_data),
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
static_cast<index_t>(coherence));
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
static_cast<index_t>(coherence));
}
else if constexpr(N == 4)
{
__builtin_amdgcn_raw_buffer_store_b64(bit_cast<fp16x4_t>(src_thread_data),
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
static_cast<index_t>(coherence));
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
static_cast<index_t>(coherence));
}
else if constexpr(N == 8)
{
@@ -1558,10 +1561,10 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
static_cast<index_t>(coherence));
#else
__builtin_amdgcn_raw_buffer_store_b128(bit_cast<fp32x4_t>(src_thread_data),
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
static_cast<index_t>(coherence));
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
static_cast<index_t>(coherence));
#endif
}
}
@@ -1570,26 +1573,26 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
if constexpr(N == 1)
{
__builtin_amdgcn_raw_buffer_store_b16(bit_cast<int16_t>(src_thread_data),
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
static_cast<index_t>(coherence));
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
static_cast<index_t>(coherence));
}
else if constexpr(N == 2)
{
__builtin_amdgcn_raw_buffer_store_b32(bit_cast<int16x2_t>(src_thread_data),
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
static_cast<index_t>(coherence));
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
static_cast<index_t>(coherence));
}
else if constexpr(N == 4)
{
__builtin_amdgcn_raw_buffer_store_b64(bit_cast<int16x4_t>(src_thread_data),
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
static_cast<index_t>(coherence));
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
static_cast<index_t>(coherence));
}
else if constexpr(N == 8)
{
@@ -1613,26 +1616,26 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
if constexpr(N == 1)
{
__builtin_amdgcn_raw_buffer_store_b16(bit_cast<uint16_t>(src_thread_data),
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
static_cast<index_t>(coherence));
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
static_cast<index_t>(coherence));
}
else if constexpr(N == 2)
{
__builtin_amdgcn_raw_buffer_store_b32(bit_cast<uint16x2_t>(src_thread_data),
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
static_cast<index_t>(coherence));
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
static_cast<index_t>(coherence));
}
else if constexpr(N == 4)
{
__builtin_amdgcn_raw_buffer_store_b64(bit_cast<uint16x4_t>(src_thread_data),
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
static_cast<index_t>(coherence));
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
static_cast<index_t>(coherence));
}
else if constexpr(N == 8)
{
@@ -2081,11 +2084,12 @@ template <typename T,
index_t N,
amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default,
bool pre_nop = false>
CK_TILE_DEVICE void amd_async_buffer_load_with_oob_raw(T* smem,
const __amdgpu_buffer_rsrc_t src_wave_buffer_resource,
index_t src_thread_element_offset,
index_t src_linear_element_offset,
bool_constant<pre_nop> = {})
CK_TILE_DEVICE void
amd_async_buffer_load_with_oob_raw(T* smem,
const __amdgpu_buffer_rsrc_t src_wave_buffer_resource,
index_t src_thread_element_offset,
index_t src_linear_element_offset,
bool_constant<pre_nop> = {})
{
index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);
index_t src_linear_addr_offset = src_linear_element_offset * sizeof(T);
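The trailing bool_constant<pre_nop> = {} parameter in the signature above is a compile-time tag rather than a runtime flag; a minimal, self-contained sketch of that idiom (re-declared locally for illustration, ck_tile ships its own bool_constant) looks like this:

// Minimal sketch of the tag-dispatch idiom: the flag travels in the type, so
// the branch is resolved at compile time and adds no runtime argument traffic.
template <bool B>
struct bool_constant_illus { static constexpr bool value = B; };

template <bool pre_nop = false>
CK_TILE_DEVICE void issue_async_load_illus(bool_constant_illus<pre_nop> = {})
{
    if constexpr(pre_nop)
    {
        // assumption: a pre-load nop / scheduling hint would be emitted here
    }
    // ... the actual buffer-load builtin would be issued here ...
}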
@@ -2103,12 +2107,13 @@ template <typename T,
index_t N,
amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default,
bool oob_conditional_check = false>
CK_TILE_DEVICE void amd_async_buffer_load_with_oob(CK_TILE_LDS_ADDR T* smem,
const __amdgpu_buffer_rsrc_t src_wave_buffer_resource,
index_t src_thread_element_offset,
index_t src_linear_element_offset,
bool is_valid_element,
bool_constant<oob_conditional_check> = {})
CK_TILE_DEVICE void
amd_async_buffer_load_with_oob(CK_TILE_LDS_ADDR T* smem,
const __amdgpu_buffer_rsrc_t src_wave_buffer_resource,
index_t src_thread_element_offset,
index_t src_linear_element_offset,
bool is_valid_element,
bool_constant<oob_conditional_check> = {})
{
index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);
index_t src_linear_addr_offset = src_linear_element_offset * sizeof(T);
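Putting the reformatted pieces together, a hypothetical per-thread copy routine (the name copy_elements, the zero wave offsets, and the assumption that amd_buffer_store_impl takes its arguments in the same order as the _with_bytes overload shown earlier are all illustrative) would look like:

// Hypothetical end-to-end use of this header: each thread moves N contiguous
// elements of T from p_src to p_dst through wave buffer resources.
// (T, N) must be one of the combinations accepted by the static_asserts above.
template <typename T, index_t N>
CK_TILE_DEVICE void copy_elements(const T* p_src, T* p_dst, index_t thread_element_offset)
{
    const index_t byte_offset = static_cast<index_t>(thread_element_offset * sizeof(T));

    auto src_rsrc = make_wave_buffer_resource_new(p_src); // unbounded, size = 0xffffffff
    auto dst_rsrc = make_wave_buffer_resource_new(p_dst);

    // pull N elements into registers, then write them back out
    thread_buffer<T, N> regs = amd_buffer_load_impl<T, N>(src_rsrc, byte_offset, 0);
    amd_buffer_store_impl(regs, dst_rsrc, byte_offset, 0);
}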