Commit 67204577 authored by illsilin

fix clang format

parent 7567a2bd
@@ -34,10 +34,10 @@ CK_TILE_DEVICE int32x4_t make_wave_buffer_resource(const void* ptr, uint32_t siz
    return r;
}

CK_TILE_DEVICE __amdgpu_buffer_rsrc_t make_wave_buffer_resource_new(const void* ptr,
                                                                    uint32_t size = 0xffffffff)
{
    auto p = const_cast<remove_cv_t<void>*>(ptr);
    return __builtin_amdgcn_make_buffer_rsrc(p, 0, size, CK_TILE_BUFFER_RESOURCE_3RD_DWORD);
}
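For orientation, a minimal sketch of how the resource helper pairs with the raw buffer-load builtins used throughout this file. The wrapper function and its parameter names below are invented for illustration and are not part of this commit:

CK_TILE_DEVICE float load_one_float(const float* p_global, uint32_t num_bytes, index_t byte_offset)
{
    // Build the 128-bit buffer descriptor for the wave, then issue a 32-bit raw
    // buffer load through it. The descriptor carries the buffer size, so the
    // hardware range check applies to the offset.
    __amdgpu_buffer_rsrc_t rsrc = make_wave_buffer_resource_new(p_global, num_bytes);
    int32_t raw = __builtin_amdgcn_raw_buffer_load_b32(
        rsrc, byte_offset, 0 /* wave offset */, 0 /* default coherence */);
    return bit_cast<float>(raw);
}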
@@ -906,12 +906,12 @@ CK_TILE_DEVICE_EXTERN float llvm_amdgcn_raw_buffer_atomic_add_fp32(
    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32.v4i32");

// buffer atomic-max fp64
CK_TILE_DEVICE_EXTERN double llvm_amdgcn_raw_buffer_atomic_max_fp64(
    double vdata,
    int32x4_t rsrc, // dst_wave_buffer_resource
    int voffset,    // dst_thread_addr_offset
    int soffset,    // dst_wave_addr_offset
    int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64.v4i32");

// Direct loads from global to LDS.
CK_TILE_DEVICE_EXTERN void
@@ -977,58 +977,59 @@ amd_buffer_load_impl_with_bytes(__amdgpu_buffer_rsrc_t src_wave_buffer_resource,
    if constexpr(N == 1)
    {
        return bit_cast<rtn_type>(
            __builtin_amdgcn_raw_buffer_load_b8(src_wave_buffer_resource,
                                                src_thread_addr_offset,
                                                src_wave_addr_offset,
                                                static_cast<index_t>(coherence)));
    }
    else if constexpr(N == 2)
    {
        int16_t tmp = __builtin_amdgcn_raw_buffer_load_b16(src_wave_buffer_resource,
                                                           src_thread_addr_offset,
                                                           src_wave_addr_offset,
                                                           static_cast<index_t>(coherence));
        return bit_cast<rtn_type>(tmp);
    }
    else if constexpr(N == 4)
    {
        int32_t tmp = __builtin_amdgcn_raw_buffer_load_b32(src_wave_buffer_resource,
                                                           src_thread_addr_offset,
                                                           src_wave_addr_offset,
                                                           static_cast<index_t>(coherence));
        return bit_cast<rtn_type>(tmp);
    }
    else if constexpr(N == 8)
    {
        int32x2_t tmp = __builtin_amdgcn_raw_buffer_load_b64(src_wave_buffer_resource,
                                                             src_thread_addr_offset,
                                                             src_wave_addr_offset,
                                                             static_cast<index_t>(coherence));
        return bit_cast<rtn_type>(tmp);
    }
    else if constexpr(N == 16)
    {
        int32x4_t tmp = __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
                                                              src_thread_addr_offset,
                                                              src_wave_addr_offset,
                                                              static_cast<index_t>(coherence));
        return bit_cast<rtn_type>(tmp);
    }
    else if constexpr(N == 32)
    {
        int32x4_t tmp0 = __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
                                                               src_thread_addr_offset,
                                                               src_wave_addr_offset,
                                                               static_cast<index_t>(coherence));
        int32x4_t tmp1 =
            __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
                                                  src_thread_addr_offset,
                                                  src_wave_addr_offset + 4 * sizeof(int32_t),
                                                  static_cast<index_t>(coherence));
        thread_buffer<int32_t, 8> tmp;
        tmp.template get_as<int32x4_t>()(number<0>{}) = tmp0;
@@ -1039,24 +1040,24 @@ amd_buffer_load_impl_with_bytes(__amdgpu_buffer_rsrc_t src_wave_buffer_resource,
    else if constexpr(N == 64)
    {
        int32x4_t tmp0 = __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
                                                               src_thread_addr_offset,
                                                               src_wave_addr_offset,
                                                               static_cast<index_t>(coherence));
        int32x4_t tmp1 =
            __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
                                                  src_thread_addr_offset,
                                                  src_wave_addr_offset + 4 * sizeof(int32_t),
                                                  static_cast<index_t>(coherence));
        int32x4_t tmp2 =
            __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
                                                  src_thread_addr_offset,
                                                  src_wave_addr_offset + 8 * sizeof(int32_t),
                                                  static_cast<index_t>(coherence));
        int32x4_t tmp3 =
            __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
                                                  src_thread_addr_offset,
                                                  src_wave_addr_offset + 12 * sizeof(int32_t),
                                                  static_cast<index_t>(coherence));
        thread_buffer<int32_t, 16> tmp;
@@ -1076,9 +1077,10 @@ amd_buffer_load_impl_with_bytes(__amdgpu_buffer_rsrc_t src_wave_buffer_resource,
template <typename T,
          index_t N,
          amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
CK_TILE_DEVICE thread_buffer<T, N>
amd_buffer_load_impl(__amdgpu_buffer_rsrc_t src_wave_buffer_resource,
                     index_t src_thread_addr_offset,
                     index_t src_wave_addr_offset)
{
    static_assert(
        (std::is_same<T, double>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
@@ -1100,25 +1102,25 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(__amdgpu_buffer_rsrc_t s
        {
            return bit_cast<rtn_type>(
                __builtin_amdgcn_raw_buffer_load_b32(src_wave_buffer_resource,
                                                     src_thread_addr_offset,
                                                     src_wave_addr_offset,
                                                     static_cast<index_t>(coherence)));
        }
        else if constexpr(N == 2)
        {
            return bit_cast<rtn_type>(
                __builtin_amdgcn_raw_buffer_load_b64(src_wave_buffer_resource,
                                                     src_thread_addr_offset,
                                                     src_wave_addr_offset,
                                                     static_cast<index_t>(coherence)));
        }
        else if constexpr(N == 4)
        {
            return bit_cast<rtn_type>(
                __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
                                                      src_thread_addr_offset,
                                                      src_wave_addr_offset,
                                                      static_cast<index_t>(coherence)));
        }
        else if constexpr(N == 8)
        {
@@ -1126,15 +1128,15 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(__amdgpu_buffer_rsrc_t s
            tmp.template get_as<fp32x4_t>()(number<0>{}) =
                __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
                                                      src_thread_addr_offset,
                                                      src_wave_addr_offset,
                                                      static_cast<index_t>(coherence));
            tmp.template get_as<fp32x4_t>()(number<1>{}) =
                __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
                                                      src_thread_addr_offset,
                                                      src_wave_addr_offset + 4 * sizeof(float),
                                                      static_cast<index_t>(coherence));
            return tmp;
        }
@@ -1144,27 +1146,27 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(__amdgpu_buffer_rsrc_t s
            tmp.template get_as<fp32x4_t>()(number<0>{}) =
                __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
                                                      src_thread_addr_offset,
                                                      src_wave_addr_offset,
                                                      static_cast<index_t>(coherence));
            tmp.template get_as<fp32x4_t>()(number<1>{}) =
                __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
                                                      src_thread_addr_offset,
                                                      src_wave_addr_offset + 4 * sizeof(float),
                                                      static_cast<index_t>(coherence));
            tmp.template get_as<fp32x4_t>()(number<2>{}) =
                __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
                                                      src_thread_addr_offset,
                                                      src_wave_addr_offset + 8 * sizeof(float),
                                                      static_cast<index_t>(coherence));
            tmp.template get_as<fp32x4_t>()(number<3>{}) =
                __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
                                                      src_thread_addr_offset,
                                                      src_wave_addr_offset + 12 * sizeof(float),
                                                      static_cast<index_t>(coherence));
            return tmp;
        }
@@ -1175,33 +1177,33 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(__amdgpu_buffer_rsrc_t s
        {
            return bit_cast<rtn_type>(
                __builtin_amdgcn_raw_buffer_load_b16(src_wave_buffer_resource,
                                                     src_thread_addr_offset,
                                                     src_wave_addr_offset,
                                                     static_cast<index_t>(coherence)));
        }
        else if constexpr(N == 2)
        {
            return bit_cast<rtn_type>(
                __builtin_amdgcn_raw_buffer_load_b32(src_wave_buffer_resource,
                                                     src_thread_addr_offset,
                                                     src_wave_addr_offset,
                                                     static_cast<index_t>(coherence)));
        }
        else if constexpr(N == 4)
        {
            return bit_cast<rtn_type>(
                __builtin_amdgcn_raw_buffer_load_b64(src_wave_buffer_resource,
                                                     src_thread_addr_offset,
                                                     src_wave_addr_offset,
                                                     static_cast<index_t>(coherence)));
        }
        else if constexpr(N == 8)
        {
            // use fp32 load to mimic fp16 load
            fp32x4_t tmp = __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
                                                                 src_thread_addr_offset,
                                                                 src_wave_addr_offset,
                                                                 static_cast<index_t>(coherence));
            return bit_cast<rtn_type>(tmp);
        }
@@ -1212,32 +1214,32 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(__amdgpu_buffer_rsrc_t s
        {
            return bit_cast<rtn_type>(
                __builtin_amdgcn_raw_buffer_load_b16(src_wave_buffer_resource,
                                                     src_thread_addr_offset,
                                                     src_wave_addr_offset,
                                                     static_cast<index_t>(coherence)));
        }
        else if constexpr(N == 2)
        {
            return bit_cast<rtn_type>(
                __builtin_amdgcn_raw_buffer_load_b32(src_wave_buffer_resource,
                                                     src_thread_addr_offset,
                                                     src_wave_addr_offset,
                                                     static_cast<index_t>(coherence)));
        }
        else if constexpr(N == 4)
        {
            return bit_cast<rtn_type>(
                __builtin_amdgcn_raw_buffer_load_b64(src_wave_buffer_resource,
                                                     src_thread_addr_offset,
                                                     src_wave_addr_offset,
                                                     static_cast<index_t>(coherence)));
        }
        else if constexpr(N == 8)
        {
            int32x4_t tmp = __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
                                                                  src_thread_addr_offset,
                                                                  src_wave_addr_offset,
                                                                  static_cast<index_t>(coherence));
            return bit_cast<rtn_type>(tmp);
        }
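As a usage sketch (hypothetical call site, assuming the surrounding ck_tile headers), amd_buffer_load_impl dispatches on T and N to the builtins above; the offset arguments are in bytes:

CK_TILE_DEVICE thread_buffer<float, 4> load_fp32x4(const float* p_global,
                                                   uint32_t num_bytes,
                                                   index_t lane_element_offset)
{
    __amdgpu_buffer_rsrc_t rsrc = make_wave_buffer_resource_new(p_global, num_bytes);
    // T = float, N = 4 selects the 128-bit path (__builtin_amdgcn_raw_buffer_load_b128);
    // scale the per-lane element offset by sizeof(float) to get a byte offset.
    return amd_buffer_load_impl<float, 4>(rsrc,
                                          lane_element_offset * sizeof(float),
                                          0 /* src_wave_addr_offset */);
}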
@@ -1352,10 +1354,11 @@ CK_TILE_DEVICE void amd_async_buffer_load(CK_TILE_LDS_ADDR T* smem,
template <index_t N,
          amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
CK_TILE_DEVICE void
amd_buffer_store_impl_with_bytes(const thread_buffer<int8_t, N> src_thread_data,
                                 __amdgpu_buffer_rsrc_t dst_wave_buffer_resource,
                                 index_t dst_thread_addr_offset,
                                 index_t dst_wave_addr_offset)
{
    static_assert(N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32 || N == 64,
                  "wrong! not implemented");
@@ -1363,43 +1366,43 @@ CK_TILE_DEVICE void amd_buffer_store_impl_with_bytes(const thread_buffer<int8_t,
    if constexpr(N == 1)
    {
        __builtin_amdgcn_raw_buffer_store_b8(bit_cast<int8_t>(src_thread_data),
                                             dst_wave_buffer_resource,
                                             dst_thread_addr_offset,
                                             dst_wave_addr_offset,
                                             static_cast<index_t>(coherence));
    }
    else if constexpr(N == 2)
    {
        __builtin_amdgcn_raw_buffer_store_b16(bit_cast<int16_t>(src_thread_data),
                                              dst_wave_buffer_resource,
                                              dst_thread_addr_offset,
                                              dst_wave_addr_offset,
                                              static_cast<index_t>(coherence));
    }
    else if constexpr(N == 4)
    {
        __builtin_amdgcn_raw_buffer_store_b32(bit_cast<int32_t>(src_thread_data),
                                              dst_wave_buffer_resource,
                                              dst_thread_addr_offset,
                                              dst_wave_addr_offset,
                                              static_cast<index_t>(coherence));
    }
    else if constexpr(N == 8)
    {
        __builtin_amdgcn_raw_buffer_store_b64(bit_cast<int32x2_t>(src_thread_data),
                                              dst_wave_buffer_resource,
                                              dst_thread_addr_offset,
                                              dst_wave_addr_offset,
                                              static_cast<index_t>(coherence));
    }
    else if constexpr(N == 16)
    {
        __builtin_amdgcn_raw_buffer_store_b128(bit_cast<int32x4_t>(src_thread_data),
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset,
                                               static_cast<index_t>(coherence));
    }
    else if constexpr(N == 32)
    {
@@ -1477,26 +1480,26 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
        if constexpr(N == 1)
        {
            __builtin_amdgcn_raw_buffer_store_b32(bit_cast<float>(src_thread_data),
                                                  dst_wave_buffer_resource,
                                                  dst_thread_addr_offset,
                                                  dst_wave_addr_offset,
                                                  static_cast<index_t>(coherence));
        }
        else if constexpr(N == 2)
        {
            __builtin_amdgcn_raw_buffer_store_b64(bit_cast<fp32x2_t>(src_thread_data),
                                                  dst_wave_buffer_resource,
                                                  dst_thread_addr_offset,
                                                  dst_wave_addr_offset,
                                                  static_cast<index_t>(coherence));
        }
        else if constexpr(N == 4)
        {
            __builtin_amdgcn_raw_buffer_store_b128(bit_cast<fp32x4_t>(src_thread_data),
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset,
                                                   static_cast<index_t>(coherence));
        }
        else if constexpr(N == 8)
        {
@@ -1519,26 +1522,26 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
        if constexpr(N == 1)
        {
            __builtin_amdgcn_raw_buffer_store_b16(bit_cast<_Float16>(src_thread_data),
                                                  dst_wave_buffer_resource,
                                                  dst_thread_addr_offset,
                                                  dst_wave_addr_offset,
                                                  static_cast<index_t>(coherence));
        }
        else if constexpr(N == 2)
        {
            __builtin_amdgcn_raw_buffer_store_b32(bit_cast<fp16x2_t>(src_thread_data),
                                                  dst_wave_buffer_resource,
                                                  dst_thread_addr_offset,
                                                  dst_wave_addr_offset,
                                                  static_cast<index_t>(coherence));
        }
        else if constexpr(N == 4)
        {
            __builtin_amdgcn_raw_buffer_store_b64(bit_cast<fp16x4_t>(src_thread_data),
                                                  dst_wave_buffer_resource,
                                                  dst_thread_addr_offset,
                                                  dst_wave_addr_offset,
                                                  static_cast<index_t>(coherence));
        }
        else if constexpr(N == 8)
        {
@@ -1558,10 +1561,10 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
                                                   static_cast<index_t>(coherence));
#else
            __builtin_amdgcn_raw_buffer_store_b128(bit_cast<fp32x4_t>(src_thread_data),
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset,
                                                   static_cast<index_t>(coherence));
#endif
        }
    }
@@ -1570,26 +1573,26 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
        if constexpr(N == 1)
        {
            __builtin_amdgcn_raw_buffer_store_b16(bit_cast<int16_t>(src_thread_data),
                                                  dst_wave_buffer_resource,
                                                  dst_thread_addr_offset,
                                                  dst_wave_addr_offset,
                                                  static_cast<index_t>(coherence));
        }
        else if constexpr(N == 2)
        {
            __builtin_amdgcn_raw_buffer_store_b32(bit_cast<int16x2_t>(src_thread_data),
                                                  dst_wave_buffer_resource,
                                                  dst_thread_addr_offset,
                                                  dst_wave_addr_offset,
                                                  static_cast<index_t>(coherence));
        }
        else if constexpr(N == 4)
        {
            __builtin_amdgcn_raw_buffer_store_b64(bit_cast<int16x4_t>(src_thread_data),
                                                  dst_wave_buffer_resource,
                                                  dst_thread_addr_offset,
                                                  dst_wave_addr_offset,
                                                  static_cast<index_t>(coherence));
        }
        else if constexpr(N == 8)
        {
@@ -1613,26 +1616,26 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
        if constexpr(N == 1)
        {
            __builtin_amdgcn_raw_buffer_store_b16(bit_cast<uint16_t>(src_thread_data),
                                                  dst_wave_buffer_resource,
                                                  dst_thread_addr_offset,
                                                  dst_wave_addr_offset,
                                                  static_cast<index_t>(coherence));
        }
        else if constexpr(N == 2)
        {
            __builtin_amdgcn_raw_buffer_store_b32(bit_cast<uint16x2_t>(src_thread_data),
                                                  dst_wave_buffer_resource,
                                                  dst_thread_addr_offset,
                                                  dst_wave_addr_offset,
                                                  static_cast<index_t>(coherence));
        }
        else if constexpr(N == 4)
        {
            __builtin_amdgcn_raw_buffer_store_b64(bit_cast<uint16x4_t>(src_thread_data),
                                                  dst_wave_buffer_resource,
                                                  dst_thread_addr_offset,
                                                  dst_wave_addr_offset,
                                                  static_cast<index_t>(coherence));
        }
        else if constexpr(N == 8)
        {
@@ -2081,11 +2084,12 @@ template <typename T,
          index_t N,
          amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default,
          bool pre_nop = false>
CK_TILE_DEVICE void
amd_async_buffer_load_with_oob_raw(T* smem,
                                   const __amdgpu_buffer_rsrc_t src_wave_buffer_resource,
                                   index_t src_thread_element_offset,
                                   index_t src_linear_element_offset,
                                   bool_constant<pre_nop> = {})
{
    index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);
    index_t src_linear_addr_offset = src_linear_element_offset * sizeof(T);
@@ -2103,12 +2107,13 @@ template <typename T,
          index_t N,
          amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default,
          bool oob_conditional_check = false>
CK_TILE_DEVICE void
amd_async_buffer_load_with_oob(CK_TILE_LDS_ADDR T* smem,
                               const __amdgpu_buffer_rsrc_t src_wave_buffer_resource,
                               index_t src_thread_element_offset,
                               index_t src_linear_element_offset,
                               bool is_valid_element,
                               bool_constant<oob_conditional_check> = {})
{
    index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);
    index_t src_linear_addr_offset = src_linear_element_offset * sizeof(T);
...
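For the async global-to-LDS path touched above, a rough call-site sketch; the staging function, its parameter names, and the chosen offsets are hypothetical, and the element offsets are converted to byte offsets inside the wrapper as shown in the hunks above:

template <typename T, index_t N>
CK_TILE_DEVICE void stage_tile_to_lds(CK_TILE_LDS_ADDR T* smem,
                                      const T* p_global,
                                      uint32_t num_bytes,
                                      index_t thread_element_offset,
                                      bool is_valid_element)
{
    auto rsrc = make_wave_buffer_resource_new(p_global, num_bytes);
    // Leaves oob_conditional_check at its default (false); pass bool_constant<true>{}
    // as the last argument to enable the compile-time-selected out-of-bounds check.
    amd_async_buffer_load_with_oob<T, N>(
        smem, rsrc, thread_element_offset, 0 /* src_linear_element_offset */, is_valid_element);
}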