Commit 7567a2bd authored by illsilin's avatar illsilin
Browse files

replace buffer load/store intrinsics with built-ins in ck_tile

parent 36eb9b69
...@@ -34,6 +34,13 @@ CK_TILE_DEVICE int32x4_t make_wave_buffer_resource(const void* ptr, uint32_t siz ...@@ -34,6 +34,13 @@ CK_TILE_DEVICE int32x4_t make_wave_buffer_resource(const void* ptr, uint32_t siz
return r; return r;
} }
CK_TILE_DEVICE __amdgpu_buffer_rsrc_t make_wave_buffer_resource_new(const void* ptr, uint32_t size = 0xffffffff)
{
auto p = const_cast<remove_cv_t<void>*>(ptr);
return __builtin_amdgcn_make_buffer_rsrc(p, 0, size, CK_TILE_BUFFER_RESOURCE_3RD_DWORD);
}
namespace impl { namespace impl {
// below type indicate the data type used for buffer load inline asm // below type indicate the data type used for buffer load inline asm
// clang-format off // clang-format off
...@@ -874,240 +881,13 @@ CK_TILE_DEVICE auto async_load_fence_raw(index_t cnt = 0) ...@@ -874,240 +881,13 @@ CK_TILE_DEVICE auto async_load_fence_raw(index_t cnt = 0)
asm volatile("s_waitcnt vmcnt(%0)" : : "n"(cnt) : "memory"); asm volatile("s_waitcnt vmcnt(%0)" : : "n"(cnt) : "memory");
} }
// buffer load i8
CK_TILE_DEVICE_EXTERN int8_t
llvm_amdgcn_raw_buffer_load_i8(int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i8");
CK_TILE_DEVICE_EXTERN int8x2_t
llvm_amdgcn_raw_buffer_load_i8x2(int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i8");
CK_TILE_DEVICE_EXTERN int8x4_t
llvm_amdgcn_raw_buffer_load_i8x4(int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i8");
// buffer load i16
CK_TILE_DEVICE_EXTERN int16_t
llvm_amdgcn_raw_buffer_load_i16(int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i16");
CK_TILE_DEVICE_EXTERN int16x2_t
llvm_amdgcn_raw_buffer_load_i16x2(int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i16");
CK_TILE_DEVICE_EXTERN int16x4_t
llvm_amdgcn_raw_buffer_load_i16x4(int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i16");
// buffer load i32
CK_TILE_DEVICE_EXTERN int32_t
llvm_amdgcn_raw_buffer_load_i32(int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i32");
CK_TILE_DEVICE_EXTERN int32x2_t
llvm_amdgcn_raw_buffer_load_i32x2(int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i32");
CK_TILE_DEVICE_EXTERN int32x4_t
llvm_amdgcn_raw_buffer_load_i32x4(int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i32");
// buffer load fp16
CK_TILE_DEVICE_EXTERN _Float16
llvm_amdgcn_raw_buffer_load_fp16(int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f16");
CK_TILE_DEVICE_EXTERN fp16x2_t
llvm_amdgcn_raw_buffer_load_fp16x2(int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f16");
CK_TILE_DEVICE_EXTERN fp16x4_t
llvm_amdgcn_raw_buffer_load_fp16x4(int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f16");
// buffer load fp32
CK_TILE_DEVICE_EXTERN float
llvm_amdgcn_raw_buffer_load_fp32(int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f32");
CK_TILE_DEVICE_EXTERN fp32x2_t
llvm_amdgcn_raw_buffer_load_fp32x2(int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f32");
CK_TILE_DEVICE_EXTERN fp32x4_t
llvm_amdgcn_raw_buffer_load_fp32x4(int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f32");
// buffer store i8
CK_TILE_DEVICE_EXTERN void
llvm_amdgcn_raw_buffer_store_i8(int8_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i8");
CK_TILE_DEVICE_EXTERN void
llvm_amdgcn_raw_buffer_store_i8x2(int8x2_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i8");
CK_TILE_DEVICE_EXTERN void
llvm_amdgcn_raw_buffer_store_i8x4(int8x4_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i8");
// buffer store i16
CK_TILE_DEVICE_EXTERN void
llvm_amdgcn_raw_buffer_store_i16(int16_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16");
CK_TILE_DEVICE_EXTERN void
llvm_amdgcn_raw_buffer_store_i16x2(int16x2_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16");
CK_TILE_DEVICE_EXTERN void
llvm_amdgcn_raw_buffer_store_i16x4(int16x4_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16");
// buffer store i32
CK_TILE_DEVICE_EXTERN void
llvm_amdgcn_raw_buffer_store_i32(int32_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i32");
// buffer store ui16
CK_TILE_DEVICE_EXTERN void
llvm_amdgcn_raw_buffer_store_ui16(uint16_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16");
CK_TILE_DEVICE_EXTERN void
llvm_amdgcn_raw_buffer_store_ui16x2(uint16x2_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16");
CK_TILE_DEVICE_EXTERN void
llvm_amdgcn_raw_buffer_store_ui16x4(uint16x4_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16");
CK_TILE_DEVICE_EXTERN void
llvm_amdgcn_raw_buffer_store_i32x2(int32x2_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i32");
CK_TILE_DEVICE_EXTERN void
llvm_amdgcn_raw_buffer_store_i32x4(int32x4_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i32");
// buffer store fp16
CK_TILE_DEVICE_EXTERN void
llvm_amdgcn_raw_buffer_store_fp16(_Float16 vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f16");
CK_TILE_DEVICE_EXTERN void
llvm_amdgcn_raw_buffer_store_fp16x2(fp16x2_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f16");
CK_TILE_DEVICE_EXTERN void
llvm_amdgcn_raw_buffer_store_fp16x4(fp16x4_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f16");
// buffer store fp32
CK_TILE_DEVICE_EXTERN void
llvm_amdgcn_raw_buffer_store_fp32(float vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f32");
CK_TILE_DEVICE_EXTERN void
llvm_amdgcn_raw_buffer_store_fp32x2(fp32x2_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f32");
CK_TILE_DEVICE_EXTERN void
llvm_amdgcn_raw_buffer_store_fp32x4(fp32x4_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f32");
// buffer atomic-add fp16 // buffer atomic-add fp16
CK_TILE_DEVICE_EXTERN fp16x2_t llvm_amdgcn_raw_buffer_atomic_add_fp16x2( CK_TILE_DEVICE_EXTERN fp16x2_t llvm_amdgcn_raw_buffer_atomic_add_fp16x2(
fp16x2_t vdata, fp16x2_t vdata,
int32x4_t rsrc, int32x4_t rsrc,
index_t voffset, index_t voffset,
index_t soffset, index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2f16"); index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2f16.v4i32");
// buffer atomic-add i32 // buffer atomic-add i32
CK_TILE_DEVICE_EXTERN int32_t llvm_amdgcn_raw_buffer_atomic_add_i32( CK_TILE_DEVICE_EXTERN int32_t llvm_amdgcn_raw_buffer_atomic_add_i32(
...@@ -1115,7 +895,7 @@ CK_TILE_DEVICE_EXTERN int32_t llvm_amdgcn_raw_buffer_atomic_add_i32( ...@@ -1115,7 +895,7 @@ CK_TILE_DEVICE_EXTERN int32_t llvm_amdgcn_raw_buffer_atomic_add_i32(
int32x4_t rsrc, int32x4_t rsrc,
index_t voffset, index_t voffset,
index_t soffset, index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i32"); index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i3.v4i322");
// buffer atomic-add fp32 // buffer atomic-add fp32
CK_TILE_DEVICE_EXTERN float llvm_amdgcn_raw_buffer_atomic_add_fp32( CK_TILE_DEVICE_EXTERN float llvm_amdgcn_raw_buffer_atomic_add_fp32(
...@@ -1123,7 +903,7 @@ CK_TILE_DEVICE_EXTERN float llvm_amdgcn_raw_buffer_atomic_add_fp32( ...@@ -1123,7 +903,7 @@ CK_TILE_DEVICE_EXTERN float llvm_amdgcn_raw_buffer_atomic_add_fp32(
int32x4_t rsrc, int32x4_t rsrc,
index_t voffset, index_t voffset,
index_t soffset, index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32"); index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32.v4i32");
// buffer atomic-max fp64 // buffer atomic-max fp64
CK_TILE_DEVICE_EXTERN double CK_TILE_DEVICE_EXTERN double
...@@ -1131,7 +911,7 @@ llvm_amdgcn_raw_buffer_atomic_max_fp64(double vdata, ...@@ -1131,7 +911,7 @@ llvm_amdgcn_raw_buffer_atomic_max_fp64(double vdata,
int32x4_t rsrc, // dst_wave_buffer_resource int32x4_t rsrc, // dst_wave_buffer_resource
int voffset, // dst_thread_addr_offset int voffset, // dst_thread_addr_offset
int soffset, // dst_wave_addr_offset int soffset, // dst_wave_addr_offset
int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64"); int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64.v4i32");
// Direct loads from global to LDS. // Direct loads from global to LDS.
CK_TILE_DEVICE_EXTERN void CK_TILE_DEVICE_EXTERN void
...@@ -1141,7 +921,7 @@ llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc, ...@@ -1141,7 +921,7 @@ llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc,
index_t voffset, index_t voffset,
index_t soffset, index_t soffset,
index_t offset, index_t offset,
index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds"); index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds.v4i32");
template <bool pre_nop = false> template <bool pre_nop = false>
CK_TILE_DEVICE void async_buffer_load_dword_v(void* smem, CK_TILE_DEVICE void async_buffer_load_dword_v(void* smem,
...@@ -1186,7 +966,7 @@ enum struct amd_buffer_coherence_enum ...@@ -1186,7 +966,7 @@ enum struct amd_buffer_coherence_enum
template <index_t N, template <index_t N,
amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default> amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
CK_TILE_DEVICE thread_buffer<int8_t, N> CK_TILE_DEVICE thread_buffer<int8_t, N>
amd_buffer_load_impl_with_bytes(int32x4_t src_wave_buffer_resource, amd_buffer_load_impl_with_bytes(__amdgpu_buffer_rsrc_t src_wave_buffer_resource,
index_t src_thread_addr_offset, index_t src_thread_addr_offset,
index_t src_wave_addr_offset) index_t src_wave_addr_offset)
{ {
...@@ -1197,7 +977,7 @@ amd_buffer_load_impl_with_bytes(int32x4_t src_wave_buffer_resource, ...@@ -1197,7 +977,7 @@ amd_buffer_load_impl_with_bytes(int32x4_t src_wave_buffer_resource,
if constexpr(N == 1) if constexpr(N == 1)
{ {
return bit_cast<rtn_type>(llvm_amdgcn_raw_buffer_load_i8(src_wave_buffer_resource, return bit_cast<rtn_type>(__builtin_amdgcn_raw_buffer_load_b8(src_wave_buffer_resource,
src_thread_addr_offset, src_thread_addr_offset,
src_wave_addr_offset, src_wave_addr_offset,
static_cast<index_t>(coherence))); static_cast<index_t>(coherence)));
...@@ -1205,7 +985,7 @@ amd_buffer_load_impl_with_bytes(int32x4_t src_wave_buffer_resource, ...@@ -1205,7 +985,7 @@ amd_buffer_load_impl_with_bytes(int32x4_t src_wave_buffer_resource,
else if constexpr(N == 2) else if constexpr(N == 2)
{ {
int16_t tmp = llvm_amdgcn_raw_buffer_load_i16(src_wave_buffer_resource, int16_t tmp = __builtin_amdgcn_raw_buffer_load_b16(src_wave_buffer_resource,
src_thread_addr_offset, src_thread_addr_offset,
src_wave_addr_offset, src_wave_addr_offset,
static_cast<index_t>(coherence)); static_cast<index_t>(coherence));
...@@ -1214,7 +994,7 @@ amd_buffer_load_impl_with_bytes(int32x4_t src_wave_buffer_resource, ...@@ -1214,7 +994,7 @@ amd_buffer_load_impl_with_bytes(int32x4_t src_wave_buffer_resource,
} }
else if constexpr(N == 4) else if constexpr(N == 4)
{ {
int32_t tmp = llvm_amdgcn_raw_buffer_load_i32(src_wave_buffer_resource, int32_t tmp = __builtin_amdgcn_raw_buffer_load_b32(src_wave_buffer_resource,
src_thread_addr_offset, src_thread_addr_offset,
src_wave_addr_offset, src_wave_addr_offset,
static_cast<index_t>(coherence)); static_cast<index_t>(coherence));
...@@ -1223,7 +1003,7 @@ amd_buffer_load_impl_with_bytes(int32x4_t src_wave_buffer_resource, ...@@ -1223,7 +1003,7 @@ amd_buffer_load_impl_with_bytes(int32x4_t src_wave_buffer_resource,
} }
else if constexpr(N == 8) else if constexpr(N == 8)
{ {
int32x2_t tmp = llvm_amdgcn_raw_buffer_load_i32x2(src_wave_buffer_resource, int32x2_t tmp = __builtin_amdgcn_raw_buffer_load_b64(src_wave_buffer_resource,
src_thread_addr_offset, src_thread_addr_offset,
src_wave_addr_offset, src_wave_addr_offset,
static_cast<index_t>(coherence)); static_cast<index_t>(coherence));
...@@ -1232,7 +1012,7 @@ amd_buffer_load_impl_with_bytes(int32x4_t src_wave_buffer_resource, ...@@ -1232,7 +1012,7 @@ amd_buffer_load_impl_with_bytes(int32x4_t src_wave_buffer_resource,
} }
else if constexpr(N == 16) else if constexpr(N == 16)
{ {
int32x4_t tmp = llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource, int32x4_t tmp = __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
src_thread_addr_offset, src_thread_addr_offset,
src_wave_addr_offset, src_wave_addr_offset,
static_cast<index_t>(coherence)); static_cast<index_t>(coherence));
...@@ -1240,12 +1020,12 @@ amd_buffer_load_impl_with_bytes(int32x4_t src_wave_buffer_resource, ...@@ -1240,12 +1020,12 @@ amd_buffer_load_impl_with_bytes(int32x4_t src_wave_buffer_resource,
} }
else if constexpr(N == 32) else if constexpr(N == 32)
{ {
int32x4_t tmp0 = llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource, int32x4_t tmp0 = __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
src_thread_addr_offset, src_thread_addr_offset,
src_wave_addr_offset, src_wave_addr_offset,
static_cast<index_t>(coherence)); static_cast<index_t>(coherence));
int32x4_t tmp1 = int32x4_t tmp1 =
llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource, __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
src_thread_addr_offset, src_thread_addr_offset,
src_wave_addr_offset + 4 * sizeof(int32_t), src_wave_addr_offset + 4 * sizeof(int32_t),
static_cast<index_t>(coherence)); static_cast<index_t>(coherence));
...@@ -1258,22 +1038,22 @@ amd_buffer_load_impl_with_bytes(int32x4_t src_wave_buffer_resource, ...@@ -1258,22 +1038,22 @@ amd_buffer_load_impl_with_bytes(int32x4_t src_wave_buffer_resource,
} }
else if constexpr(N == 64) else if constexpr(N == 64)
{ {
int32x4_t tmp0 = llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource, int32x4_t tmp0 = __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
src_thread_addr_offset, src_thread_addr_offset,
src_wave_addr_offset, src_wave_addr_offset,
static_cast<index_t>(coherence)); static_cast<index_t>(coherence));
int32x4_t tmp1 = int32x4_t tmp1 =
llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource, __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
src_thread_addr_offset, src_thread_addr_offset,
src_wave_addr_offset + 4 * sizeof(int32_t), src_wave_addr_offset + 4 * sizeof(int32_t),
static_cast<index_t>(coherence)); static_cast<index_t>(coherence));
int32x4_t tmp2 = int32x4_t tmp2 =
llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource, __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
src_thread_addr_offset, src_thread_addr_offset,
src_wave_addr_offset + 8 * sizeof(int32_t), src_wave_addr_offset + 8 * sizeof(int32_t),
static_cast<index_t>(coherence)); static_cast<index_t>(coherence));
int32x4_t tmp3 = int32x4_t tmp3 =
llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource, __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
src_thread_addr_offset, src_thread_addr_offset,
src_wave_addr_offset + 12 * sizeof(int32_t), src_wave_addr_offset + 12 * sizeof(int32_t),
static_cast<index_t>(coherence)); static_cast<index_t>(coherence));
...@@ -1296,7 +1076,7 @@ amd_buffer_load_impl_with_bytes(int32x4_t src_wave_buffer_resource, ...@@ -1296,7 +1076,7 @@ amd_buffer_load_impl_with_bytes(int32x4_t src_wave_buffer_resource,
template <typename T, template <typename T,
index_t N, index_t N,
amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default> amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffer_resource, CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(__amdgpu_buffer_rsrc_t src_wave_buffer_resource,
index_t src_thread_addr_offset, index_t src_thread_addr_offset,
index_t src_wave_addr_offset) index_t src_wave_addr_offset)
{ {
...@@ -1319,7 +1099,7 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe ...@@ -1319,7 +1099,7 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
if constexpr(N == 1) if constexpr(N == 1)
{ {
return bit_cast<rtn_type>( return bit_cast<rtn_type>(
llvm_amdgcn_raw_buffer_load_fp32(src_wave_buffer_resource, __builtin_amdgcn_raw_buffer_load_b32(src_wave_buffer_resource,
src_thread_addr_offset, src_thread_addr_offset,
src_wave_addr_offset, src_wave_addr_offset,
static_cast<index_t>(coherence))); static_cast<index_t>(coherence)));
...@@ -1327,7 +1107,7 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe ...@@ -1327,7 +1107,7 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
else if constexpr(N == 2) else if constexpr(N == 2)
{ {
return bit_cast<rtn_type>( return bit_cast<rtn_type>(
llvm_amdgcn_raw_buffer_load_fp32x2(src_wave_buffer_resource, __builtin_amdgcn_raw_buffer_load_b64(src_wave_buffer_resource,
src_thread_addr_offset, src_thread_addr_offset,
src_wave_addr_offset, src_wave_addr_offset,
static_cast<index_t>(coherence))); static_cast<index_t>(coherence)));
...@@ -1335,7 +1115,7 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe ...@@ -1335,7 +1115,7 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
else if constexpr(N == 4) else if constexpr(N == 4)
{ {
return bit_cast<rtn_type>( return bit_cast<rtn_type>(
llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource, __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
src_thread_addr_offset, src_thread_addr_offset,
src_wave_addr_offset, src_wave_addr_offset,
static_cast<index_t>(coherence))); static_cast<index_t>(coherence)));
...@@ -1345,13 +1125,13 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe ...@@ -1345,13 +1125,13 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
thread_buffer<float, 8> tmp; thread_buffer<float, 8> tmp;
tmp.template get_as<fp32x4_t>()(number<0>{}) = tmp.template get_as<fp32x4_t>()(number<0>{}) =
llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource, __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
src_thread_addr_offset, src_thread_addr_offset,
src_wave_addr_offset, src_wave_addr_offset,
static_cast<index_t>(coherence)); static_cast<index_t>(coherence));
tmp.template get_as<fp32x4_t>()(number<1>{}) = tmp.template get_as<fp32x4_t>()(number<1>{}) =
llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource, __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
src_thread_addr_offset, src_thread_addr_offset,
src_wave_addr_offset + 4 * sizeof(float), src_wave_addr_offset + 4 * sizeof(float),
static_cast<index_t>(coherence)); static_cast<index_t>(coherence));
...@@ -1363,25 +1143,25 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe ...@@ -1363,25 +1143,25 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
thread_buffer<float, 16> tmp; thread_buffer<float, 16> tmp;
tmp.template get_as<fp32x4_t>()(number<0>{}) = tmp.template get_as<fp32x4_t>()(number<0>{}) =
llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource, __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
src_thread_addr_offset, src_thread_addr_offset,
src_wave_addr_offset, src_wave_addr_offset,
static_cast<index_t>(coherence)); static_cast<index_t>(coherence));
tmp.template get_as<fp32x4_t>()(number<1>{}) = tmp.template get_as<fp32x4_t>()(number<1>{}) =
llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource, __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
src_thread_addr_offset, src_thread_addr_offset,
src_wave_addr_offset + 4 * sizeof(float), src_wave_addr_offset + 4 * sizeof(float),
static_cast<index_t>(coherence)); static_cast<index_t>(coherence));
tmp.template get_as<fp32x4_t>()(number<2>{}) = tmp.template get_as<fp32x4_t>()(number<2>{}) =
llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource, __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
src_thread_addr_offset, src_thread_addr_offset,
src_wave_addr_offset + 8 * sizeof(float), src_wave_addr_offset + 8 * sizeof(float),
static_cast<index_t>(coherence)); static_cast<index_t>(coherence));
tmp.template get_as<fp32x4_t>()(number<3>{}) = tmp.template get_as<fp32x4_t>()(number<3>{}) =
llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource, __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
src_thread_addr_offset, src_thread_addr_offset,
src_wave_addr_offset + 12 * sizeof(float), src_wave_addr_offset + 12 * sizeof(float),
static_cast<index_t>(coherence)); static_cast<index_t>(coherence));
...@@ -1394,7 +1174,7 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe ...@@ -1394,7 +1174,7 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
if constexpr(N == 1) if constexpr(N == 1)
{ {
return bit_cast<rtn_type>( return bit_cast<rtn_type>(
llvm_amdgcn_raw_buffer_load_fp16(src_wave_buffer_resource, __builtin_amdgcn_raw_buffer_load_b16(src_wave_buffer_resource,
src_thread_addr_offset, src_thread_addr_offset,
src_wave_addr_offset, src_wave_addr_offset,
static_cast<index_t>(coherence))); static_cast<index_t>(coherence)));
...@@ -1402,7 +1182,7 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe ...@@ -1402,7 +1182,7 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
else if constexpr(N == 2) else if constexpr(N == 2)
{ {
return bit_cast<rtn_type>( return bit_cast<rtn_type>(
llvm_amdgcn_raw_buffer_load_fp16x2(src_wave_buffer_resource, __builtin_amdgcn_raw_buffer_load_b32(src_wave_buffer_resource,
src_thread_addr_offset, src_thread_addr_offset,
src_wave_addr_offset, src_wave_addr_offset,
static_cast<index_t>(coherence))); static_cast<index_t>(coherence)));
...@@ -1410,7 +1190,7 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe ...@@ -1410,7 +1190,7 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
else if constexpr(N == 4) else if constexpr(N == 4)
{ {
return bit_cast<rtn_type>( return bit_cast<rtn_type>(
llvm_amdgcn_raw_buffer_load_fp16x4(src_wave_buffer_resource, __builtin_amdgcn_raw_buffer_load_b64(src_wave_buffer_resource,
src_thread_addr_offset, src_thread_addr_offset,
src_wave_addr_offset, src_wave_addr_offset,
static_cast<index_t>(coherence))); static_cast<index_t>(coherence)));
...@@ -1418,7 +1198,7 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe ...@@ -1418,7 +1198,7 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
else if constexpr(N == 8) else if constexpr(N == 8)
{ {
// use fp32 load to mimic fp16 load // use fp32 load to mimic fp16 load
fp32x4_t tmp = llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource, fp32x4_t tmp = __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
src_thread_addr_offset, src_thread_addr_offset,
src_wave_addr_offset, src_wave_addr_offset,
static_cast<index_t>(coherence)); static_cast<index_t>(coherence));
...@@ -1431,7 +1211,7 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe ...@@ -1431,7 +1211,7 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
if constexpr(N == 1) if constexpr(N == 1)
{ {
return bit_cast<rtn_type>( return bit_cast<rtn_type>(
llvm_amdgcn_raw_buffer_load_i16(src_wave_buffer_resource, __builtin_amdgcn_raw_buffer_load_b16(src_wave_buffer_resource,
src_thread_addr_offset, src_thread_addr_offset,
src_wave_addr_offset, src_wave_addr_offset,
static_cast<index_t>(coherence))); static_cast<index_t>(coherence)));
...@@ -1439,7 +1219,7 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe ...@@ -1439,7 +1219,7 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
else if constexpr(N == 2) else if constexpr(N == 2)
{ {
return bit_cast<rtn_type>( return bit_cast<rtn_type>(
llvm_amdgcn_raw_buffer_load_i16x2(src_wave_buffer_resource, __builtin_amdgcn_raw_buffer_load_b32(src_wave_buffer_resource,
src_thread_addr_offset, src_thread_addr_offset,
src_wave_addr_offset, src_wave_addr_offset,
static_cast<index_t>(coherence))); static_cast<index_t>(coherence)));
...@@ -1447,14 +1227,14 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe ...@@ -1447,14 +1227,14 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
else if constexpr(N == 4) else if constexpr(N == 4)
{ {
return bit_cast<rtn_type>( return bit_cast<rtn_type>(
llvm_amdgcn_raw_buffer_load_i16x4(src_wave_buffer_resource, __builtin_amdgcn_raw_buffer_load_b64(src_wave_buffer_resource,
src_thread_addr_offset, src_thread_addr_offset,
src_wave_addr_offset, src_wave_addr_offset,
static_cast<index_t>(coherence))); static_cast<index_t>(coherence)));
} }
else if constexpr(N == 8) else if constexpr(N == 8)
{ {
int32x4_t tmp = llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource, int32x4_t tmp = __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
src_thread_addr_offset, src_thread_addr_offset,
src_wave_addr_offset, src_wave_addr_offset,
static_cast<index_t>(coherence)); static_cast<index_t>(coherence));
...@@ -1573,7 +1353,7 @@ CK_TILE_DEVICE void amd_async_buffer_load(CK_TILE_LDS_ADDR T* smem, ...@@ -1573,7 +1353,7 @@ CK_TILE_DEVICE void amd_async_buffer_load(CK_TILE_LDS_ADDR T* smem,
template <index_t N, template <index_t N,
amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default> amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
CK_TILE_DEVICE void amd_buffer_store_impl_with_bytes(const thread_buffer<int8_t, N> src_thread_data, CK_TILE_DEVICE void amd_buffer_store_impl_with_bytes(const thread_buffer<int8_t, N> src_thread_data,
int32x4_t dst_wave_buffer_resource, __amdgpu_buffer_rsrc_t dst_wave_buffer_resource,
index_t dst_thread_addr_offset, index_t dst_thread_addr_offset,
index_t dst_wave_addr_offset) index_t dst_wave_addr_offset)
{ {
...@@ -1582,7 +1362,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl_with_bytes(const thread_buffer<int8_t, ...@@ -1582,7 +1362,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl_with_bytes(const thread_buffer<int8_t,
if constexpr(N == 1) if constexpr(N == 1)
{ {
llvm_amdgcn_raw_buffer_store_i8(bit_cast<int8_t>(src_thread_data), __builtin_amdgcn_raw_buffer_store_b8(bit_cast<int8_t>(src_thread_data),
dst_wave_buffer_resource, dst_wave_buffer_resource,
dst_thread_addr_offset, dst_thread_addr_offset,
dst_wave_addr_offset, dst_wave_addr_offset,
...@@ -1591,7 +1371,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl_with_bytes(const thread_buffer<int8_t, ...@@ -1591,7 +1371,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl_with_bytes(const thread_buffer<int8_t,
else if constexpr(N == 2) else if constexpr(N == 2)
{ {
llvm_amdgcn_raw_buffer_store_i16(bit_cast<int16_t>(src_thread_data), __builtin_amdgcn_raw_buffer_store_b16(bit_cast<int16_t>(src_thread_data),
dst_wave_buffer_resource, dst_wave_buffer_resource,
dst_thread_addr_offset, dst_thread_addr_offset,
dst_wave_addr_offset, dst_wave_addr_offset,
...@@ -1599,7 +1379,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl_with_bytes(const thread_buffer<int8_t, ...@@ -1599,7 +1379,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl_with_bytes(const thread_buffer<int8_t,
} }
else if constexpr(N == 4) else if constexpr(N == 4)
{ {
llvm_amdgcn_raw_buffer_store_i32(bit_cast<int32_t>(src_thread_data), __builtin_amdgcn_raw_buffer_store_b32(bit_cast<int32_t>(src_thread_data),
dst_wave_buffer_resource, dst_wave_buffer_resource,
dst_thread_addr_offset, dst_thread_addr_offset,
dst_wave_addr_offset, dst_wave_addr_offset,
...@@ -1607,7 +1387,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl_with_bytes(const thread_buffer<int8_t, ...@@ -1607,7 +1387,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl_with_bytes(const thread_buffer<int8_t,
} }
else if constexpr(N == 8) else if constexpr(N == 8)
{ {
llvm_amdgcn_raw_buffer_store_i32x2(bit_cast<int32x2_t>(src_thread_data), __builtin_amdgcn_raw_buffer_store_b64(bit_cast<int32x2_t>(src_thread_data),
dst_wave_buffer_resource, dst_wave_buffer_resource,
dst_thread_addr_offset, dst_thread_addr_offset,
dst_wave_addr_offset, dst_wave_addr_offset,
...@@ -1615,7 +1395,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl_with_bytes(const thread_buffer<int8_t, ...@@ -1615,7 +1395,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl_with_bytes(const thread_buffer<int8_t,
} }
else if constexpr(N == 16) else if constexpr(N == 16)
{ {
llvm_amdgcn_raw_buffer_store_i32x4(bit_cast<int32x4_t>(src_thread_data), __builtin_amdgcn_raw_buffer_store_b128(bit_cast<int32x4_t>(src_thread_data),
dst_wave_buffer_resource, dst_wave_buffer_resource,
dst_thread_addr_offset, dst_thread_addr_offset,
dst_wave_addr_offset, dst_wave_addr_offset,
...@@ -1623,14 +1403,14 @@ CK_TILE_DEVICE void amd_buffer_store_impl_with_bytes(const thread_buffer<int8_t, ...@@ -1623,14 +1403,14 @@ CK_TILE_DEVICE void amd_buffer_store_impl_with_bytes(const thread_buffer<int8_t,
} }
else if constexpr(N == 32) else if constexpr(N == 32)
{ {
llvm_amdgcn_raw_buffer_store_i32x4( __builtin_amdgcn_raw_buffer_store_b128(
src_thread_data.template get_as<int32x4_t>()[number<0>{}], src_thread_data.template get_as<int32x4_t>()[number<0>{}],
dst_wave_buffer_resource, dst_wave_buffer_resource,
dst_thread_addr_offset, dst_thread_addr_offset,
dst_wave_addr_offset, dst_wave_addr_offset,
static_cast<index_t>(coherence)); static_cast<index_t>(coherence));
llvm_amdgcn_raw_buffer_store_i32x4( __builtin_amdgcn_raw_buffer_store_b128(
src_thread_data.template get_as<int32x4_t>()[number<1>{}], src_thread_data.template get_as<int32x4_t>()[number<1>{}],
dst_wave_buffer_resource, dst_wave_buffer_resource,
dst_thread_addr_offset, dst_thread_addr_offset,
...@@ -1639,28 +1419,28 @@ CK_TILE_DEVICE void amd_buffer_store_impl_with_bytes(const thread_buffer<int8_t, ...@@ -1639,28 +1419,28 @@ CK_TILE_DEVICE void amd_buffer_store_impl_with_bytes(const thread_buffer<int8_t,
} }
else if constexpr(N == 64) else if constexpr(N == 64)
{ {
llvm_amdgcn_raw_buffer_store_i32x4( __builtin_amdgcn_raw_buffer_store_b128(
src_thread_data.template get_as<int32x4_t>()[number<0>{}], src_thread_data.template get_as<int32x4_t>()[number<0>{}],
dst_wave_buffer_resource, dst_wave_buffer_resource,
dst_thread_addr_offset, dst_thread_addr_offset,
dst_wave_addr_offset, dst_wave_addr_offset,
static_cast<index_t>(coherence)); static_cast<index_t>(coherence));
llvm_amdgcn_raw_buffer_store_i32x4( __builtin_amdgcn_raw_buffer_store_b128(
src_thread_data.template get_as<int32x4_t>()[number<1>{}], src_thread_data.template get_as<int32x4_t>()[number<1>{}],
dst_wave_buffer_resource, dst_wave_buffer_resource,
dst_thread_addr_offset, dst_thread_addr_offset,
dst_wave_addr_offset + sizeof(int32_t) * 4, dst_wave_addr_offset + sizeof(int32_t) * 4,
static_cast<index_t>(coherence)); static_cast<index_t>(coherence));
llvm_amdgcn_raw_buffer_store_i32x4( __builtin_amdgcn_raw_buffer_store_b128(
src_thread_data.template get_as<int32x4_t>()[number<2>{}], src_thread_data.template get_as<int32x4_t>()[number<2>{}],
dst_wave_buffer_resource, dst_wave_buffer_resource,
dst_thread_addr_offset, dst_thread_addr_offset,
dst_wave_addr_offset + sizeof(int32_t) * 8, dst_wave_addr_offset + sizeof(int32_t) * 8,
static_cast<index_t>(coherence)); static_cast<index_t>(coherence));
llvm_amdgcn_raw_buffer_store_i32x4( __builtin_amdgcn_raw_buffer_store_b128(
src_thread_data.template get_as<int32x4_t>()[number<3>{}], src_thread_data.template get_as<int32x4_t>()[number<3>{}],
dst_wave_buffer_resource, dst_wave_buffer_resource,
dst_thread_addr_offset, dst_thread_addr_offset,
...@@ -1673,7 +1453,7 @@ template <typename T, ...@@ -1673,7 +1453,7 @@ template <typename T,
index_t N, index_t N,
amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default> amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_data, CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_data,
int32x4_t dst_wave_buffer_resource, __amdgpu_buffer_rsrc_t dst_wave_buffer_resource,
index_t dst_thread_addr_offset, index_t dst_thread_addr_offset,
index_t dst_wave_addr_offset) index_t dst_wave_addr_offset)
{ {
...@@ -1696,7 +1476,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d ...@@ -1696,7 +1476,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
{ {
if constexpr(N == 1) if constexpr(N == 1)
{ {
llvm_amdgcn_raw_buffer_store_fp32(bit_cast<float>(src_thread_data), __builtin_amdgcn_raw_buffer_store_b32(bit_cast<float>(src_thread_data),
dst_wave_buffer_resource, dst_wave_buffer_resource,
dst_thread_addr_offset, dst_thread_addr_offset,
dst_wave_addr_offset, dst_wave_addr_offset,
...@@ -1704,7 +1484,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d ...@@ -1704,7 +1484,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
} }
else if constexpr(N == 2) else if constexpr(N == 2)
{ {
llvm_amdgcn_raw_buffer_store_fp32x2(bit_cast<fp32x2_t>(src_thread_data), __builtin_amdgcn_raw_buffer_store_b64(bit_cast<fp32x2_t>(src_thread_data),
dst_wave_buffer_resource, dst_wave_buffer_resource,
dst_thread_addr_offset, dst_thread_addr_offset,
dst_wave_addr_offset, dst_wave_addr_offset,
...@@ -1712,7 +1492,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d ...@@ -1712,7 +1492,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
} }
else if constexpr(N == 4) else if constexpr(N == 4)
{ {
llvm_amdgcn_raw_buffer_store_fp32x4(bit_cast<fp32x4_t>(src_thread_data), __builtin_amdgcn_raw_buffer_store_b128(bit_cast<fp32x4_t>(src_thread_data),
dst_wave_buffer_resource, dst_wave_buffer_resource,
dst_thread_addr_offset, dst_thread_addr_offset,
dst_wave_addr_offset, dst_wave_addr_offset,
...@@ -1720,13 +1500,13 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d ...@@ -1720,13 +1500,13 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
} }
else if constexpr(N == 8) else if constexpr(N == 8)
{ {
llvm_amdgcn_raw_buffer_store_fp32x4( __builtin_amdgcn_raw_buffer_store_b128(
src_thread_data.template get_as<fp32x4_t>()[number<0>{}], src_thread_data.template get_as<fp32x4_t>()[number<0>{}],
dst_wave_buffer_resource, dst_wave_buffer_resource,
dst_thread_addr_offset, dst_thread_addr_offset,
dst_wave_addr_offset, dst_wave_addr_offset,
static_cast<index_t>(coherence)); static_cast<index_t>(coherence));
llvm_amdgcn_raw_buffer_store_fp32x4( __builtin_amdgcn_raw_buffer_store_b128(
src_thread_data.template get_as<fp32x4_t>()[number<1>{}], src_thread_data.template get_as<fp32x4_t>()[number<1>{}],
dst_wave_buffer_resource, dst_wave_buffer_resource,
dst_thread_addr_offset, dst_thread_addr_offset,
...@@ -1738,7 +1518,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d ...@@ -1738,7 +1518,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
{ {
if constexpr(N == 1) if constexpr(N == 1)
{ {
llvm_amdgcn_raw_buffer_store_fp16(bit_cast<_Float16>(src_thread_data), __builtin_amdgcn_raw_buffer_store_b16(bit_cast<_Float16>(src_thread_data),
dst_wave_buffer_resource, dst_wave_buffer_resource,
dst_thread_addr_offset, dst_thread_addr_offset,
dst_wave_addr_offset, dst_wave_addr_offset,
...@@ -1746,7 +1526,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d ...@@ -1746,7 +1526,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
} }
else if constexpr(N == 2) else if constexpr(N == 2)
{ {
llvm_amdgcn_raw_buffer_store_fp16x2(bit_cast<fp16x2_t>(src_thread_data), __builtin_amdgcn_raw_buffer_store_b32(bit_cast<fp16x2_t>(src_thread_data),
dst_wave_buffer_resource, dst_wave_buffer_resource,
dst_thread_addr_offset, dst_thread_addr_offset,
dst_wave_addr_offset, dst_wave_addr_offset,
...@@ -1754,7 +1534,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d ...@@ -1754,7 +1534,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
} }
else if constexpr(N == 4) else if constexpr(N == 4)
{ {
llvm_amdgcn_raw_buffer_store_fp16x4(bit_cast<fp16x4_t>(src_thread_data), __builtin_amdgcn_raw_buffer_store_b64(bit_cast<fp16x4_t>(src_thread_data),
dst_wave_buffer_resource, dst_wave_buffer_resource,
dst_thread_addr_offset, dst_thread_addr_offset,
dst_wave_addr_offset, dst_wave_addr_offset,
...@@ -1765,19 +1545,19 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d ...@@ -1765,19 +1545,19 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
#if 0 #if 0
thread_buffer<fp16_t, 8> tmp{src_thread_data}; thread_buffer<fp16_t, 8> tmp{src_thread_data};
llvm_amdgcn_raw_buffer_store_fp16x4(tmp.template get_as<fp16x4_t>()[number<0>{}], __builtin_amdgcn_raw_buffer_store_b64(tmp.template get_as<fp16x4_t>()[number<0>{}],
dst_wave_buffer_resource, dst_wave_buffer_resource,
dst_thread_addr_offset, dst_thread_addr_offset,
dst_wave_addr_offset, dst_wave_addr_offset,
static_cast<index_t>(coherence)); static_cast<index_t>(coherence));
llvm_amdgcn_raw_buffer_store_fp16x4(tmp.template get_as<fp16x4_t>()[number<1>{}], __builtin_amdgcn_raw_buffer_store_b64(tmp.template get_as<fp16x4_t>()[number<1>{}],
dst_wave_buffer_resource, dst_wave_buffer_resource,
dst_thread_addr_offset, dst_thread_addr_offset,
dst_wave_addr_offset + 4 * sizeof(fp16_t), dst_wave_addr_offset + 4 * sizeof(fp16_t),
static_cast<index_t>(coherence)); static_cast<index_t>(coherence));
#else #else
llvm_amdgcn_raw_buffer_store_fp32x4(bit_cast<fp32x4_t>(src_thread_data), __builtin_amdgcn_raw_buffer_store_b128(bit_cast<fp32x4_t>(src_thread_data),
dst_wave_buffer_resource, dst_wave_buffer_resource,
dst_thread_addr_offset, dst_thread_addr_offset,
dst_wave_addr_offset, dst_wave_addr_offset,
...@@ -1789,7 +1569,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d ...@@ -1789,7 +1569,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
{ {
if constexpr(N == 1) if constexpr(N == 1)
{ {
llvm_amdgcn_raw_buffer_store_i16(bit_cast<int16_t>(src_thread_data), __builtin_amdgcn_raw_buffer_store_b16(bit_cast<int16_t>(src_thread_data),
dst_wave_buffer_resource, dst_wave_buffer_resource,
dst_thread_addr_offset, dst_thread_addr_offset,
dst_wave_addr_offset, dst_wave_addr_offset,
...@@ -1797,7 +1577,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d ...@@ -1797,7 +1577,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
} }
else if constexpr(N == 2) else if constexpr(N == 2)
{ {
llvm_amdgcn_raw_buffer_store_i16x2(bit_cast<int16x2_t>(src_thread_data), __builtin_amdgcn_raw_buffer_store_b32(bit_cast<int16x2_t>(src_thread_data),
dst_wave_buffer_resource, dst_wave_buffer_resource,
dst_thread_addr_offset, dst_thread_addr_offset,
dst_wave_addr_offset, dst_wave_addr_offset,
...@@ -1805,7 +1585,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d ...@@ -1805,7 +1585,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
} }
else if constexpr(N == 4) else if constexpr(N == 4)
{ {
llvm_amdgcn_raw_buffer_store_i16x4(bit_cast<int16x4_t>(src_thread_data), __builtin_amdgcn_raw_buffer_store_b64(bit_cast<int16x4_t>(src_thread_data),
dst_wave_buffer_resource, dst_wave_buffer_resource,
dst_thread_addr_offset, dst_thread_addr_offset,
dst_wave_addr_offset, dst_wave_addr_offset,
...@@ -1813,14 +1593,14 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d ...@@ -1813,14 +1593,14 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
} }
else if constexpr(N == 8) else if constexpr(N == 8)
{ {
llvm_amdgcn_raw_buffer_store_i16x4( __builtin_amdgcn_raw_buffer_store_b64(
src_thread_data.template get_as<int16x4_t>()[number<0>{}], src_thread_data.template get_as<int16x4_t>()[number<0>{}],
dst_wave_buffer_resource, dst_wave_buffer_resource,
dst_thread_addr_offset, dst_thread_addr_offset,
dst_wave_addr_offset, dst_wave_addr_offset,
static_cast<index_t>(coherence)); static_cast<index_t>(coherence));
llvm_amdgcn_raw_buffer_store_i16x4( __builtin_amdgcn_raw_buffer_store_b64(
src_thread_data.template get_as<int16x4_t>()[number<1>{}], src_thread_data.template get_as<int16x4_t>()[number<1>{}],
dst_wave_buffer_resource, dst_wave_buffer_resource,
dst_thread_addr_offset, dst_thread_addr_offset,
...@@ -1832,7 +1612,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d ...@@ -1832,7 +1612,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
{ {
if constexpr(N == 1) if constexpr(N == 1)
{ {
llvm_amdgcn_raw_buffer_store_ui16(bit_cast<uint16_t>(src_thread_data), __builtin_amdgcn_raw_buffer_store_b16(bit_cast<uint16_t>(src_thread_data),
dst_wave_buffer_resource, dst_wave_buffer_resource,
dst_thread_addr_offset, dst_thread_addr_offset,
dst_wave_addr_offset, dst_wave_addr_offset,
...@@ -1840,7 +1620,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d ...@@ -1840,7 +1620,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
} }
else if constexpr(N == 2) else if constexpr(N == 2)
{ {
llvm_amdgcn_raw_buffer_store_ui16x2(bit_cast<uint16x2_t>(src_thread_data), __builtin_amdgcn_raw_buffer_store_b32(bit_cast<uint16x2_t>(src_thread_data),
dst_wave_buffer_resource, dst_wave_buffer_resource,
dst_thread_addr_offset, dst_thread_addr_offset,
dst_wave_addr_offset, dst_wave_addr_offset,
...@@ -1848,7 +1628,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d ...@@ -1848,7 +1628,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
} }
else if constexpr(N == 4) else if constexpr(N == 4)
{ {
llvm_amdgcn_raw_buffer_store_ui16x4(bit_cast<uint16x4_t>(src_thread_data), __builtin_amdgcn_raw_buffer_store_b64(bit_cast<uint16x4_t>(src_thread_data),
dst_wave_buffer_resource, dst_wave_buffer_resource,
dst_thread_addr_offset, dst_thread_addr_offset,
dst_wave_addr_offset, dst_wave_addr_offset,
...@@ -1856,14 +1636,14 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d ...@@ -1856,14 +1636,14 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
} }
else if constexpr(N == 8) else if constexpr(N == 8)
{ {
llvm_amdgcn_raw_buffer_store_ui16x4( __builtin_amdgcn_raw_buffer_store_b64(
src_thread_data.template get_as<uint16x4_t>()[number<0>{}], src_thread_data.template get_as<uint16x4_t>()[number<0>{}],
dst_wave_buffer_resource, dst_wave_buffer_resource,
dst_thread_addr_offset, dst_thread_addr_offset,
dst_wave_addr_offset, dst_wave_addr_offset,
static_cast<index_t>(coherence)); static_cast<index_t>(coherence));
llvm_amdgcn_raw_buffer_store_ui16x4( __builtin_amdgcn_raw_buffer_store_b64(
src_thread_data.template get_as<uint16x4_t>()[number<1>{}], src_thread_data.template get_as<uint16x4_t>()[number<1>{}],
dst_wave_buffer_resource, dst_wave_buffer_resource,
dst_thread_addr_offset, dst_thread_addr_offset,
...@@ -2159,8 +1939,8 @@ amd_buffer_load_invalid_element_return_zero(const T* p_src_wave, ...@@ -2159,8 +1939,8 @@ amd_buffer_load_invalid_element_return_zero(const T* p_src_wave,
bool src_thread_element_valid, bool src_thread_element_valid,
index_t src_element_space_size) index_t src_element_space_size)
{ {
const int32x4_t src_wave_buffer_resource = const __amdgpu_buffer_rsrc_t src_wave_buffer_resource =
make_wave_buffer_resource(p_src_wave, src_element_space_size * sizeof(T)); make_wave_buffer_resource_new(p_src_wave, src_element_space_size * sizeof(T));
index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T); index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);
...@@ -2198,8 +1978,8 @@ amd_buffer_load_invalid_element_return_customized_value(const T* p_src_wave, ...@@ -2198,8 +1978,8 @@ amd_buffer_load_invalid_element_return_customized_value(const T* p_src_wave,
index_t src_element_space_size, index_t src_element_space_size,
T customized_value) T customized_value)
{ {
const int32x4_t src_wave_buffer_resource = const __amdgpu_buffer_rsrc_t src_wave_buffer_resource =
make_wave_buffer_resource(p_src_wave, src_element_space_size * sizeof(T)); make_wave_buffer_resource_new(p_src_wave, src_element_space_size * sizeof(T));
index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T); index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);
...@@ -2225,8 +2005,8 @@ CK_TILE_DEVICE void amd_buffer_load_raw(thread_buffer<T, N>& dst, ...@@ -2225,8 +2005,8 @@ CK_TILE_DEVICE void amd_buffer_load_raw(thread_buffer<T, N>& dst,
index_t is_valid_element = 0, index_t is_valid_element = 0,
bool_constant<pre_nop> = {}) bool_constant<pre_nop> = {})
{ {
const int32x4_t src_wave_buffer_resource = const __amdgpu_buffer_rsrc_t src_wave_buffer_resource =
make_wave_buffer_resource(p_src_wave, src_element_space_size * sizeof(T)); make_wave_buffer_resource_new(p_src_wave, src_element_space_size * sizeof(T));
index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T); index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);
index_t src_linear_addr_offset = src_linear_element_offset * sizeof(T); index_t src_linear_addr_offset = src_linear_element_offset * sizeof(T);
...@@ -2248,7 +2028,7 @@ template <typename T, ...@@ -2248,7 +2028,7 @@ template <typename T,
bool oob_conditional_check = true, bool oob_conditional_check = true,
bool pre_nop = false> bool pre_nop = false>
CK_TILE_DEVICE void amd_buffer_load_raw(thread_buffer<T, N>& dst, CK_TILE_DEVICE void amd_buffer_load_raw(thread_buffer<T, N>& dst,
const int32x4_t src_wave_buffer_resource, const __amdgpu_buffer_rsrc_t src_wave_buffer_resource,
index_t src_thread_element_offset, index_t src_thread_element_offset,
index_t src_linear_element_offset, index_t src_linear_element_offset,
index_t is_valid_element = 0, index_t is_valid_element = 0,
...@@ -2282,8 +2062,8 @@ CK_TILE_DEVICE void amd_async_buffer_load_with_oob_raw(T* smem, ...@@ -2282,8 +2062,8 @@ CK_TILE_DEVICE void amd_async_buffer_load_with_oob_raw(T* smem,
index_t src_element_space_size, index_t src_element_space_size,
bool_constant<pre_nop> = {}) bool_constant<pre_nop> = {})
{ {
const int32x4_t src_wave_buffer_resource = const __amdgpu_buffer_rsrc_t src_wave_buffer_resource =
make_wave_buffer_resource(p_src_wave, src_element_space_size * sizeof(T)); make_wave_buffer_resourcep_new(p_src_wave, src_element_space_size * sizeof(T));
index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T); index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);
index_t src_linear_addr_offset = src_linear_element_offset * sizeof(T); index_t src_linear_addr_offset = src_linear_element_offset * sizeof(T);
...@@ -2302,7 +2082,7 @@ template <typename T, ...@@ -2302,7 +2082,7 @@ template <typename T,
amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default,
bool pre_nop = false> bool pre_nop = false>
CK_TILE_DEVICE void amd_async_buffer_load_with_oob_raw(T* smem, CK_TILE_DEVICE void amd_async_buffer_load_with_oob_raw(T* smem,
const int32x4_t src_wave_buffer_resource, const __amdgpu_buffer_rsrc_t src_wave_buffer_resource,
index_t src_thread_element_offset, index_t src_thread_element_offset,
index_t src_linear_element_offset, index_t src_linear_element_offset,
bool_constant<pre_nop> = {}) bool_constant<pre_nop> = {})
...@@ -2324,7 +2104,7 @@ template <typename T, ...@@ -2324,7 +2104,7 @@ template <typename T,
amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default, amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default,
bool oob_conditional_check = false> bool oob_conditional_check = false>
CK_TILE_DEVICE void amd_async_buffer_load_with_oob(CK_TILE_LDS_ADDR T* smem, CK_TILE_DEVICE void amd_async_buffer_load_with_oob(CK_TILE_LDS_ADDR T* smem,
const int32x4_t src_wave_buffer_resource, const __amdgpu_buffer_rsrc_t src_wave_buffer_resource,
index_t src_thread_element_offset, index_t src_thread_element_offset,
index_t src_linear_element_offset, index_t src_linear_element_offset,
bool is_valid_element, bool is_valid_element,
...@@ -2356,8 +2136,8 @@ CK_TILE_DEVICE void amd_buffer_store(const thread_buffer<T, N>& src_thread_data, ...@@ -2356,8 +2136,8 @@ CK_TILE_DEVICE void amd_buffer_store(const thread_buffer<T, N>& src_thread_data,
const bool dst_thread_element_valid, const bool dst_thread_element_valid,
const index_t dst_element_space_size) const index_t dst_element_space_size)
{ {
const int32x4_t dst_wave_buffer_resource = const __amdgpu_buffer_rsrc_t dst_wave_buffer_resource =
make_wave_buffer_resource(p_dst_wave, dst_element_space_size * sizeof(T)); make_wave_buffer_resource_new(p_dst_wave, dst_element_space_size * sizeof(T));
index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T); index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T);
...@@ -2398,8 +2178,8 @@ CK_TILE_DEVICE void amd_buffer_store_raw(const thread_buffer<T, N>& src_thread_d ...@@ -2398,8 +2178,8 @@ CK_TILE_DEVICE void amd_buffer_store_raw(const thread_buffer<T, N>& src_thread_d
const bool dst_thread_element_valid, const bool dst_thread_element_valid,
const index_t dst_element_space_size) const index_t dst_element_space_size)
{ {
const int32x4_t dst_wave_buffer_resource = const __amdgpu_buffer_rsrc_t dst_wave_buffer_resource =
make_wave_buffer_resource(p_dst_wave, dst_element_space_size * sizeof(T)); make_wave_buffer_resource_new(p_dst_wave, dst_element_space_size * sizeof(T));
index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T); index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T);
index_t dst_linear_addr_offset = dst_linear_element_offset * sizeof(T); index_t dst_linear_addr_offset = dst_linear_element_offset * sizeof(T);
...@@ -2550,4 +2330,4 @@ CK_TILE_DEVICE void amd_direct_load_global_to_lds(const T* global_base_ptr, ...@@ -2550,4 +2330,4 @@ CK_TILE_DEVICE void amd_direct_load_global_to_lds(const T* global_base_ptr,
#endif #endif
} }
} // namespace ck_tile } // namespace ck_tile
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment