Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
7567a2bd
Commit
7567a2bd
authored
Feb 11, 2025
by
illsilin
Browse files
replace buffer load/store intrinsics with built-ins in ck_tile
parent
36eb9b69
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
92 additions
and
312 deletions
+92
-312
include/ck_tile/core/arch/amd_buffer_addressing.hpp
include/ck_tile/core/arch/amd_buffer_addressing.hpp
+92
-312
No files found.
include/ck_tile/core/arch/amd_buffer_addressing.hpp
View file @
7567a2bd
...
@@ -34,6 +34,13 @@ CK_TILE_DEVICE int32x4_t make_wave_buffer_resource(const void* ptr, uint32_t siz
...
@@ -34,6 +34,13 @@ CK_TILE_DEVICE int32x4_t make_wave_buffer_resource(const void* ptr, uint32_t siz
return
r
;
return
r
;
}
}
CK_TILE_DEVICE
__amdgpu_buffer_rsrc_t
make_wave_buffer_resource_new
(
const
void
*
ptr
,
uint32_t
size
=
0xffffffff
)
{
auto
p
=
const_cast
<
remove_cv_t
<
void
>*>
(
ptr
);
return
__builtin_amdgcn_make_buffer_rsrc
(
p
,
0
,
size
,
CK_TILE_BUFFER_RESOURCE_3RD_DWORD
);
}
namespace
impl
{
namespace
impl
{
// below type indicate the data type used for buffer load inline asm
// below type indicate the data type used for buffer load inline asm
// clang-format off
// clang-format off
...
@@ -874,240 +881,13 @@ CK_TILE_DEVICE auto async_load_fence_raw(index_t cnt = 0)
...
@@ -874,240 +881,13 @@ CK_TILE_DEVICE auto async_load_fence_raw(index_t cnt = 0)
asm
volatile
(
"s_waitcnt vmcnt(%0)"
:
:
"n"
(
cnt
)
:
"memory"
);
asm
volatile
(
"s_waitcnt vmcnt(%0)"
:
:
"n"
(
cnt
)
:
"memory"
);
}
}
// buffer load i8
CK_TILE_DEVICE_EXTERN
int8_t
llvm_amdgcn_raw_buffer_load_i8
(
int32x4_t
srsrc
,
index_t
voffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.load.i8"
);
CK_TILE_DEVICE_EXTERN
int8x2_t
llvm_amdgcn_raw_buffer_load_i8x2
(
int32x4_t
srsrc
,
index_t
voffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.load.v2i8"
);
CK_TILE_DEVICE_EXTERN
int8x4_t
llvm_amdgcn_raw_buffer_load_i8x4
(
int32x4_t
srsrc
,
index_t
voffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.load.v4i8"
);
// buffer load i16
CK_TILE_DEVICE_EXTERN
int16_t
llvm_amdgcn_raw_buffer_load_i16
(
int32x4_t
srsrc
,
index_t
voffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.load.i16"
);
CK_TILE_DEVICE_EXTERN
int16x2_t
llvm_amdgcn_raw_buffer_load_i16x2
(
int32x4_t
srsrc
,
index_t
voffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.load.v2i16"
);
CK_TILE_DEVICE_EXTERN
int16x4_t
llvm_amdgcn_raw_buffer_load_i16x4
(
int32x4_t
srsrc
,
index_t
voffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.load.v4i16"
);
// buffer load i32
CK_TILE_DEVICE_EXTERN
int32_t
llvm_amdgcn_raw_buffer_load_i32
(
int32x4_t
srsrc
,
index_t
voffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.load.i32"
);
CK_TILE_DEVICE_EXTERN
int32x2_t
llvm_amdgcn_raw_buffer_load_i32x2
(
int32x4_t
srsrc
,
index_t
voffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.load.v2i32"
);
CK_TILE_DEVICE_EXTERN
int32x4_t
llvm_amdgcn_raw_buffer_load_i32x4
(
int32x4_t
srsrc
,
index_t
voffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.load.v4i32"
);
// buffer load fp16
CK_TILE_DEVICE_EXTERN
_Float16
llvm_amdgcn_raw_buffer_load_fp16
(
int32x4_t
srsrc
,
index_t
voffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.load.f16"
);
CK_TILE_DEVICE_EXTERN
fp16x2_t
llvm_amdgcn_raw_buffer_load_fp16x2
(
int32x4_t
srsrc
,
index_t
voffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.load.v2f16"
);
CK_TILE_DEVICE_EXTERN
fp16x4_t
llvm_amdgcn_raw_buffer_load_fp16x4
(
int32x4_t
srsrc
,
index_t
voffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.load.v4f16"
);
// buffer load fp32
CK_TILE_DEVICE_EXTERN
float
llvm_amdgcn_raw_buffer_load_fp32
(
int32x4_t
srsrc
,
index_t
voffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.load.f32"
);
CK_TILE_DEVICE_EXTERN
fp32x2_t
llvm_amdgcn_raw_buffer_load_fp32x2
(
int32x4_t
srsrc
,
index_t
voffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.load.v2f32"
);
CK_TILE_DEVICE_EXTERN
fp32x4_t
llvm_amdgcn_raw_buffer_load_fp32x4
(
int32x4_t
srsrc
,
index_t
voffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.load.v4f32"
);
// buffer store i8
CK_TILE_DEVICE_EXTERN
void
llvm_amdgcn_raw_buffer_store_i8
(
int8_t
vdata
,
int32x4_t
rsrc
,
index_t
voffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.store.i8"
);
CK_TILE_DEVICE_EXTERN
void
llvm_amdgcn_raw_buffer_store_i8x2
(
int8x2_t
vdata
,
int32x4_t
rsrc
,
index_t
voffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.store.v2i8"
);
CK_TILE_DEVICE_EXTERN
void
llvm_amdgcn_raw_buffer_store_i8x4
(
int8x4_t
vdata
,
int32x4_t
rsrc
,
index_t
voffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.store.v4i8"
);
// buffer store i16
CK_TILE_DEVICE_EXTERN
void
llvm_amdgcn_raw_buffer_store_i16
(
int16_t
vdata
,
int32x4_t
rsrc
,
index_t
voffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.store.i16"
);
CK_TILE_DEVICE_EXTERN
void
llvm_amdgcn_raw_buffer_store_i16x2
(
int16x2_t
vdata
,
int32x4_t
rsrc
,
index_t
voffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.store.v2i16"
);
CK_TILE_DEVICE_EXTERN
void
llvm_amdgcn_raw_buffer_store_i16x4
(
int16x4_t
vdata
,
int32x4_t
rsrc
,
index_t
voffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.store.v4i16"
);
// buffer store i32
CK_TILE_DEVICE_EXTERN
void
llvm_amdgcn_raw_buffer_store_i32
(
int32_t
vdata
,
int32x4_t
rsrc
,
index_t
voffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.store.i32"
);
// buffer store ui16
CK_TILE_DEVICE_EXTERN
void
llvm_amdgcn_raw_buffer_store_ui16
(
uint16_t
vdata
,
int32x4_t
rsrc
,
index_t
voffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.store.i16"
);
CK_TILE_DEVICE_EXTERN
void
llvm_amdgcn_raw_buffer_store_ui16x2
(
uint16x2_t
vdata
,
int32x4_t
rsrc
,
index_t
voffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.store.v2i16"
);
CK_TILE_DEVICE_EXTERN
void
llvm_amdgcn_raw_buffer_store_ui16x4
(
uint16x4_t
vdata
,
int32x4_t
rsrc
,
index_t
voffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.store.v4i16"
);
CK_TILE_DEVICE_EXTERN
void
llvm_amdgcn_raw_buffer_store_i32x2
(
int32x2_t
vdata
,
int32x4_t
rsrc
,
index_t
voffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.store.v2i32"
);
CK_TILE_DEVICE_EXTERN
void
llvm_amdgcn_raw_buffer_store_i32x4
(
int32x4_t
vdata
,
int32x4_t
rsrc
,
index_t
voffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.store.v4i32"
);
// buffer store fp16
CK_TILE_DEVICE_EXTERN
void
llvm_amdgcn_raw_buffer_store_fp16
(
_Float16
vdata
,
int32x4_t
rsrc
,
index_t
voffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.store.f16"
);
CK_TILE_DEVICE_EXTERN
void
llvm_amdgcn_raw_buffer_store_fp16x2
(
fp16x2_t
vdata
,
int32x4_t
rsrc
,
index_t
voffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.store.v2f16"
);
CK_TILE_DEVICE_EXTERN
void
llvm_amdgcn_raw_buffer_store_fp16x4
(
fp16x4_t
vdata
,
int32x4_t
rsrc
,
index_t
voffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.store.v4f16"
);
// buffer store fp32
CK_TILE_DEVICE_EXTERN
void
llvm_amdgcn_raw_buffer_store_fp32
(
float
vdata
,
int32x4_t
rsrc
,
index_t
voffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.store.f32"
);
CK_TILE_DEVICE_EXTERN
void
llvm_amdgcn_raw_buffer_store_fp32x2
(
fp32x2_t
vdata
,
int32x4_t
rsrc
,
index_t
voffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.store.v2f32"
);
CK_TILE_DEVICE_EXTERN
void
llvm_amdgcn_raw_buffer_store_fp32x4
(
fp32x4_t
vdata
,
int32x4_t
rsrc
,
index_t
voffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.store.v4f32"
);
// buffer atomic-add fp16
// buffer atomic-add fp16
CK_TILE_DEVICE_EXTERN
fp16x2_t
llvm_amdgcn_raw_buffer_atomic_add_fp16x2
(
CK_TILE_DEVICE_EXTERN
fp16x2_t
llvm_amdgcn_raw_buffer_atomic_add_fp16x2
(
fp16x2_t
vdata
,
fp16x2_t
vdata
,
int32x4_t
rsrc
,
int32x4_t
rsrc
,
index_t
voffset
,
index_t
voffset
,
index_t
soffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.atomic.fadd.v2f16"
);
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.atomic.fadd.v2f16
.v4i32
"
);
// buffer atomic-add i32
// buffer atomic-add i32
CK_TILE_DEVICE_EXTERN
int32_t
llvm_amdgcn_raw_buffer_atomic_add_i32
(
CK_TILE_DEVICE_EXTERN
int32_t
llvm_amdgcn_raw_buffer_atomic_add_i32
(
...
@@ -1115,7 +895,7 @@ CK_TILE_DEVICE_EXTERN int32_t llvm_amdgcn_raw_buffer_atomic_add_i32(
...
@@ -1115,7 +895,7 @@ CK_TILE_DEVICE_EXTERN int32_t llvm_amdgcn_raw_buffer_atomic_add_i32(
int32x4_t
rsrc
,
int32x4_t
rsrc
,
index_t
voffset
,
index_t
voffset
,
index_t
soffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.atomic.add.i32"
);
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.atomic.add.i3
.v4i32
2"
);
// buffer atomic-add fp32
// buffer atomic-add fp32
CK_TILE_DEVICE_EXTERN
float
llvm_amdgcn_raw_buffer_atomic_add_fp32
(
CK_TILE_DEVICE_EXTERN
float
llvm_amdgcn_raw_buffer_atomic_add_fp32
(
...
@@ -1123,7 +903,7 @@ CK_TILE_DEVICE_EXTERN float llvm_amdgcn_raw_buffer_atomic_add_fp32(
...
@@ -1123,7 +903,7 @@ CK_TILE_DEVICE_EXTERN float llvm_amdgcn_raw_buffer_atomic_add_fp32(
int32x4_t
rsrc
,
int32x4_t
rsrc
,
index_t
voffset
,
index_t
voffset
,
index_t
soffset
,
index_t
soffset
,
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.atomic.fadd.f32"
);
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.atomic.fadd.f32
.v4i32
"
);
// buffer atomic-max fp64
// buffer atomic-max fp64
CK_TILE_DEVICE_EXTERN
double
CK_TILE_DEVICE_EXTERN
double
...
@@ -1131,7 +911,7 @@ llvm_amdgcn_raw_buffer_atomic_max_fp64(double vdata,
...
@@ -1131,7 +911,7 @@ llvm_amdgcn_raw_buffer_atomic_max_fp64(double vdata,
int32x4_t
rsrc
,
// dst_wave_buffer_resource
int32x4_t
rsrc
,
// dst_wave_buffer_resource
int
voffset
,
// dst_thread_addr_offset
int
voffset
,
// dst_thread_addr_offset
int
soffset
,
// dst_wave_addr_offset
int
soffset
,
// dst_wave_addr_offset
int
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.atomic.fmax.f64"
);
int
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.atomic.fmax.f64
.v4i32
"
);
// Direct loads from global to LDS.
// Direct loads from global to LDS.
CK_TILE_DEVICE_EXTERN
void
CK_TILE_DEVICE_EXTERN
void
...
@@ -1141,7 +921,7 @@ llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc,
...
@@ -1141,7 +921,7 @@ llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc,
index_t
voffset
,
index_t
voffset
,
index_t
soffset
,
index_t
soffset
,
index_t
offset
,
index_t
offset
,
index_t
aux
)
__asm
(
"llvm.amdgcn.raw.buffer.load.lds"
);
index_t
aux
)
__asm
(
"llvm.amdgcn.raw.buffer.load.lds
.v4i32
"
);
template
<
bool
pre_nop
=
false
>
template
<
bool
pre_nop
=
false
>
CK_TILE_DEVICE
void
async_buffer_load_dword_v
(
void
*
smem
,
CK_TILE_DEVICE
void
async_buffer_load_dword_v
(
void
*
smem
,
...
@@ -1186,7 +966,7 @@ enum struct amd_buffer_coherence_enum
...
@@ -1186,7 +966,7 @@ enum struct amd_buffer_coherence_enum
template
<
index_t
N
,
template
<
index_t
N
,
amd_buffer_coherence_enum
coherence
=
amd_buffer_coherence_enum
::
coherence_default
>
amd_buffer_coherence_enum
coherence
=
amd_buffer_coherence_enum
::
coherence_default
>
CK_TILE_DEVICE
thread_buffer
<
int8_t
,
N
>
CK_TILE_DEVICE
thread_buffer
<
int8_t
,
N
>
amd_buffer_load_impl_with_bytes
(
int32x4
_t
src_wave_buffer_resource
,
amd_buffer_load_impl_with_bytes
(
__amdgpu_buffer_rsrc
_t
src_wave_buffer_resource
,
index_t
src_thread_addr_offset
,
index_t
src_thread_addr_offset
,
index_t
src_wave_addr_offset
)
index_t
src_wave_addr_offset
)
{
{
...
@@ -1197,7 +977,7 @@ amd_buffer_load_impl_with_bytes(int32x4_t src_wave_buffer_resource,
...
@@ -1197,7 +977,7 @@ amd_buffer_load_impl_with_bytes(int32x4_t src_wave_buffer_resource,
if
constexpr
(
N
==
1
)
if
constexpr
(
N
==
1
)
{
{
return
bit_cast
<
rtn_type
>
(
llvm
_amdgcn_raw_buffer_load_
i
8
(
src_wave_buffer_resource
,
return
bit_cast
<
rtn_type
>
(
__builtin
_amdgcn_raw_buffer_load_
b
8
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_thread_addr_offset
,
src_wave_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
)));
static_cast
<
index_t
>
(
coherence
)));
...
@@ -1205,7 +985,7 @@ amd_buffer_load_impl_with_bytes(int32x4_t src_wave_buffer_resource,
...
@@ -1205,7 +985,7 @@ amd_buffer_load_impl_with_bytes(int32x4_t src_wave_buffer_resource,
else
if
constexpr
(
N
==
2
)
else
if
constexpr
(
N
==
2
)
{
{
int16_t
tmp
=
llvm
_amdgcn_raw_buffer_load_
i
16
(
src_wave_buffer_resource
,
int16_t
tmp
=
__builtin
_amdgcn_raw_buffer_load_
b
16
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_thread_addr_offset
,
src_wave_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
static_cast
<
index_t
>
(
coherence
));
...
@@ -1214,7 +994,7 @@ amd_buffer_load_impl_with_bytes(int32x4_t src_wave_buffer_resource,
...
@@ -1214,7 +994,7 @@ amd_buffer_load_impl_with_bytes(int32x4_t src_wave_buffer_resource,
}
}
else
if
constexpr
(
N
==
4
)
else
if
constexpr
(
N
==
4
)
{
{
int32_t
tmp
=
llvm
_amdgcn_raw_buffer_load_
i
32
(
src_wave_buffer_resource
,
int32_t
tmp
=
__builtin
_amdgcn_raw_buffer_load_
b
32
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_thread_addr_offset
,
src_wave_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
static_cast
<
index_t
>
(
coherence
));
...
@@ -1223,7 +1003,7 @@ amd_buffer_load_impl_with_bytes(int32x4_t src_wave_buffer_resource,
...
@@ -1223,7 +1003,7 @@ amd_buffer_load_impl_with_bytes(int32x4_t src_wave_buffer_resource,
}
}
else
if
constexpr
(
N
==
8
)
else
if
constexpr
(
N
==
8
)
{
{
int32x2_t
tmp
=
llvm
_amdgcn_raw_buffer_load_
i32x2
(
src_wave_buffer_resource
,
int32x2_t
tmp
=
__builtin
_amdgcn_raw_buffer_load_
b64
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_thread_addr_offset
,
src_wave_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
static_cast
<
index_t
>
(
coherence
));
...
@@ -1232,7 +1012,7 @@ amd_buffer_load_impl_with_bytes(int32x4_t src_wave_buffer_resource,
...
@@ -1232,7 +1012,7 @@ amd_buffer_load_impl_with_bytes(int32x4_t src_wave_buffer_resource,
}
}
else
if
constexpr
(
N
==
16
)
else
if
constexpr
(
N
==
16
)
{
{
int32x4_t
tmp
=
llvm
_amdgcn_raw_buffer_load_
i32x4
(
src_wave_buffer_resource
,
int32x4_t
tmp
=
__builtin
_amdgcn_raw_buffer_load_
b128
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_thread_addr_offset
,
src_wave_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
static_cast
<
index_t
>
(
coherence
));
...
@@ -1240,12 +1020,12 @@ amd_buffer_load_impl_with_bytes(int32x4_t src_wave_buffer_resource,
...
@@ -1240,12 +1020,12 @@ amd_buffer_load_impl_with_bytes(int32x4_t src_wave_buffer_resource,
}
}
else
if
constexpr
(
N
==
32
)
else
if
constexpr
(
N
==
32
)
{
{
int32x4_t
tmp0
=
llvm
_amdgcn_raw_buffer_load_
i32x4
(
src_wave_buffer_resource
,
int32x4_t
tmp0
=
__builtin
_amdgcn_raw_buffer_load_
b128
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_thread_addr_offset
,
src_wave_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
static_cast
<
index_t
>
(
coherence
));
int32x4_t
tmp1
=
int32x4_t
tmp1
=
llvm
_amdgcn_raw_buffer_load_
i32x4
(
src_wave_buffer_resource
,
__builtin
_amdgcn_raw_buffer_load_
b128
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_thread_addr_offset
,
src_wave_addr_offset
+
4
*
sizeof
(
int32_t
),
src_wave_addr_offset
+
4
*
sizeof
(
int32_t
),
static_cast
<
index_t
>
(
coherence
));
static_cast
<
index_t
>
(
coherence
));
...
@@ -1258,22 +1038,22 @@ amd_buffer_load_impl_with_bytes(int32x4_t src_wave_buffer_resource,
...
@@ -1258,22 +1038,22 @@ amd_buffer_load_impl_with_bytes(int32x4_t src_wave_buffer_resource,
}
}
else
if
constexpr
(
N
==
64
)
else
if
constexpr
(
N
==
64
)
{
{
int32x4_t
tmp0
=
llvm
_amdgcn_raw_buffer_load_
i32x4
(
src_wave_buffer_resource
,
int32x4_t
tmp0
=
__builtin
_amdgcn_raw_buffer_load_
b128
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_thread_addr_offset
,
src_wave_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
static_cast
<
index_t
>
(
coherence
));
int32x4_t
tmp1
=
int32x4_t
tmp1
=
llvm
_amdgcn_raw_buffer_load_
i32x4
(
src_wave_buffer_resource
,
__builtin
_amdgcn_raw_buffer_load_
b128
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_thread_addr_offset
,
src_wave_addr_offset
+
4
*
sizeof
(
int32_t
),
src_wave_addr_offset
+
4
*
sizeof
(
int32_t
),
static_cast
<
index_t
>
(
coherence
));
static_cast
<
index_t
>
(
coherence
));
int32x4_t
tmp2
=
int32x4_t
tmp2
=
llvm
_amdgcn_raw_buffer_load_
i32x4
(
src_wave_buffer_resource
,
__builtin
_amdgcn_raw_buffer_load_
b128
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_thread_addr_offset
,
src_wave_addr_offset
+
8
*
sizeof
(
int32_t
),
src_wave_addr_offset
+
8
*
sizeof
(
int32_t
),
static_cast
<
index_t
>
(
coherence
));
static_cast
<
index_t
>
(
coherence
));
int32x4_t
tmp3
=
int32x4_t
tmp3
=
llvm
_amdgcn_raw_buffer_load_
i32x4
(
src_wave_buffer_resource
,
__builtin
_amdgcn_raw_buffer_load_
b128
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_thread_addr_offset
,
src_wave_addr_offset
+
12
*
sizeof
(
int32_t
),
src_wave_addr_offset
+
12
*
sizeof
(
int32_t
),
static_cast
<
index_t
>
(
coherence
));
static_cast
<
index_t
>
(
coherence
));
...
@@ -1296,7 +1076,7 @@ amd_buffer_load_impl_with_bytes(int32x4_t src_wave_buffer_resource,
...
@@ -1296,7 +1076,7 @@ amd_buffer_load_impl_with_bytes(int32x4_t src_wave_buffer_resource,
template
<
typename
T
,
template
<
typename
T
,
index_t
N
,
index_t
N
,
amd_buffer_coherence_enum
coherence
=
amd_buffer_coherence_enum
::
coherence_default
>
amd_buffer_coherence_enum
coherence
=
amd_buffer_coherence_enum
::
coherence_default
>
CK_TILE_DEVICE
thread_buffer
<
T
,
N
>
amd_buffer_load_impl
(
int32x4
_t
src_wave_buffer_resource
,
CK_TILE_DEVICE
thread_buffer
<
T
,
N
>
amd_buffer_load_impl
(
__amdgpu_buffer_rsrc
_t
src_wave_buffer_resource
,
index_t
src_thread_addr_offset
,
index_t
src_thread_addr_offset
,
index_t
src_wave_addr_offset
)
index_t
src_wave_addr_offset
)
{
{
...
@@ -1319,7 +1099,7 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
...
@@ -1319,7 +1099,7 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
if
constexpr
(
N
==
1
)
if
constexpr
(
N
==
1
)
{
{
return
bit_cast
<
rtn_type
>
(
return
bit_cast
<
rtn_type
>
(
llvm
_amdgcn_raw_buffer_load_
fp
32
(
src_wave_buffer_resource
,
__builtin
_amdgcn_raw_buffer_load_
b
32
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_thread_addr_offset
,
src_wave_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
)));
static_cast
<
index_t
>
(
coherence
)));
...
@@ -1327,7 +1107,7 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
...
@@ -1327,7 +1107,7 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
else
if
constexpr
(
N
==
2
)
else
if
constexpr
(
N
==
2
)
{
{
return
bit_cast
<
rtn_type
>
(
return
bit_cast
<
rtn_type
>
(
llvm
_amdgcn_raw_buffer_load_
fp32x2
(
src_wave_buffer_resource
,
__builtin
_amdgcn_raw_buffer_load_
b64
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_thread_addr_offset
,
src_wave_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
)));
static_cast
<
index_t
>
(
coherence
)));
...
@@ -1335,7 +1115,7 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
...
@@ -1335,7 +1115,7 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
else
if
constexpr
(
N
==
4
)
else
if
constexpr
(
N
==
4
)
{
{
return
bit_cast
<
rtn_type
>
(
return
bit_cast
<
rtn_type
>
(
llvm
_amdgcn_raw_buffer_load_
fp32x4
(
src_wave_buffer_resource
,
__builtin
_amdgcn_raw_buffer_load_
b128
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_thread_addr_offset
,
src_wave_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
)));
static_cast
<
index_t
>
(
coherence
)));
...
@@ -1345,13 +1125,13 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
...
@@ -1345,13 +1125,13 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
thread_buffer
<
float
,
8
>
tmp
;
thread_buffer
<
float
,
8
>
tmp
;
tmp
.
template
get_as
<
fp32x4_t
>()(
number
<
0
>
{})
=
tmp
.
template
get_as
<
fp32x4_t
>()(
number
<
0
>
{})
=
llvm
_amdgcn_raw_buffer_load_
fp32x4
(
src_wave_buffer_resource
,
__builtin
_amdgcn_raw_buffer_load_
b128
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_thread_addr_offset
,
src_wave_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
static_cast
<
index_t
>
(
coherence
));
tmp
.
template
get_as
<
fp32x4_t
>()(
number
<
1
>
{})
=
tmp
.
template
get_as
<
fp32x4_t
>()(
number
<
1
>
{})
=
llvm
_amdgcn_raw_buffer_load_
fp32x4
(
src_wave_buffer_resource
,
__builtin
_amdgcn_raw_buffer_load_
b128
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_thread_addr_offset
,
src_wave_addr_offset
+
4
*
sizeof
(
float
),
src_wave_addr_offset
+
4
*
sizeof
(
float
),
static_cast
<
index_t
>
(
coherence
));
static_cast
<
index_t
>
(
coherence
));
...
@@ -1363,25 +1143,25 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
...
@@ -1363,25 +1143,25 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
thread_buffer
<
float
,
16
>
tmp
;
thread_buffer
<
float
,
16
>
tmp
;
tmp
.
template
get_as
<
fp32x4_t
>()(
number
<
0
>
{})
=
tmp
.
template
get_as
<
fp32x4_t
>()(
number
<
0
>
{})
=
llvm
_amdgcn_raw_buffer_load_
fp32x4
(
src_wave_buffer_resource
,
__builtin
_amdgcn_raw_buffer_load_
b128
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_thread_addr_offset
,
src_wave_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
static_cast
<
index_t
>
(
coherence
));
tmp
.
template
get_as
<
fp32x4_t
>()(
number
<
1
>
{})
=
tmp
.
template
get_as
<
fp32x4_t
>()(
number
<
1
>
{})
=
llvm
_amdgcn_raw_buffer_load_
fp32x4
(
src_wave_buffer_resource
,
__builtin
_amdgcn_raw_buffer_load_
b128
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_thread_addr_offset
,
src_wave_addr_offset
+
4
*
sizeof
(
float
),
src_wave_addr_offset
+
4
*
sizeof
(
float
),
static_cast
<
index_t
>
(
coherence
));
static_cast
<
index_t
>
(
coherence
));
tmp
.
template
get_as
<
fp32x4_t
>()(
number
<
2
>
{})
=
tmp
.
template
get_as
<
fp32x4_t
>()(
number
<
2
>
{})
=
llvm
_amdgcn_raw_buffer_load_
fp32x4
(
src_wave_buffer_resource
,
__builtin
_amdgcn_raw_buffer_load_
b128
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_thread_addr_offset
,
src_wave_addr_offset
+
8
*
sizeof
(
float
),
src_wave_addr_offset
+
8
*
sizeof
(
float
),
static_cast
<
index_t
>
(
coherence
));
static_cast
<
index_t
>
(
coherence
));
tmp
.
template
get_as
<
fp32x4_t
>()(
number
<
3
>
{})
=
tmp
.
template
get_as
<
fp32x4_t
>()(
number
<
3
>
{})
=
llvm
_amdgcn_raw_buffer_load_
fp32x4
(
src_wave_buffer_resource
,
__builtin
_amdgcn_raw_buffer_load_
b128
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_thread_addr_offset
,
src_wave_addr_offset
+
12
*
sizeof
(
float
),
src_wave_addr_offset
+
12
*
sizeof
(
float
),
static_cast
<
index_t
>
(
coherence
));
static_cast
<
index_t
>
(
coherence
));
...
@@ -1394,7 +1174,7 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
...
@@ -1394,7 +1174,7 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
if
constexpr
(
N
==
1
)
if
constexpr
(
N
==
1
)
{
{
return
bit_cast
<
rtn_type
>
(
return
bit_cast
<
rtn_type
>
(
llvm
_amdgcn_raw_buffer_load_
fp
16
(
src_wave_buffer_resource
,
__builtin
_amdgcn_raw_buffer_load_
b
16
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_thread_addr_offset
,
src_wave_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
)));
static_cast
<
index_t
>
(
coherence
)));
...
@@ -1402,7 +1182,7 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
...
@@ -1402,7 +1182,7 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
else
if
constexpr
(
N
==
2
)
else
if
constexpr
(
N
==
2
)
{
{
return
bit_cast
<
rtn_type
>
(
return
bit_cast
<
rtn_type
>
(
llvm
_amdgcn_raw_buffer_load_
fp16x
2
(
src_wave_buffer_resource
,
__builtin
_amdgcn_raw_buffer_load_
b3
2
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_thread_addr_offset
,
src_wave_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
)));
static_cast
<
index_t
>
(
coherence
)));
...
@@ -1410,7 +1190,7 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
...
@@ -1410,7 +1190,7 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
else
if
constexpr
(
N
==
4
)
else
if
constexpr
(
N
==
4
)
{
{
return
bit_cast
<
rtn_type
>
(
return
bit_cast
<
rtn_type
>
(
llvm
_amdgcn_raw_buffer_load_
fp16x
4
(
src_wave_buffer_resource
,
__builtin
_amdgcn_raw_buffer_load_
b6
4
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_thread_addr_offset
,
src_wave_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
)));
static_cast
<
index_t
>
(
coherence
)));
...
@@ -1418,7 +1198,7 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
...
@@ -1418,7 +1198,7 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
else
if
constexpr
(
N
==
8
)
else
if
constexpr
(
N
==
8
)
{
{
// use fp32 load to mimic fp16 load
// use fp32 load to mimic fp16 load
fp32x4_t
tmp
=
llvm
_amdgcn_raw_buffer_load_
fp32x4
(
src_wave_buffer_resource
,
fp32x4_t
tmp
=
__builtin
_amdgcn_raw_buffer_load_
b128
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_thread_addr_offset
,
src_wave_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
static_cast
<
index_t
>
(
coherence
));
...
@@ -1431,7 +1211,7 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
...
@@ -1431,7 +1211,7 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
if
constexpr
(
N
==
1
)
if
constexpr
(
N
==
1
)
{
{
return
bit_cast
<
rtn_type
>
(
return
bit_cast
<
rtn_type
>
(
llvm
_amdgcn_raw_buffer_load_
i
16
(
src_wave_buffer_resource
,
__builtin
_amdgcn_raw_buffer_load_
b
16
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_thread_addr_offset
,
src_wave_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
)));
static_cast
<
index_t
>
(
coherence
)));
...
@@ -1439,7 +1219,7 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
...
@@ -1439,7 +1219,7 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
else
if
constexpr
(
N
==
2
)
else
if
constexpr
(
N
==
2
)
{
{
return
bit_cast
<
rtn_type
>
(
return
bit_cast
<
rtn_type
>
(
llvm
_amdgcn_raw_buffer_load_
i16x
2
(
src_wave_buffer_resource
,
__builtin
_amdgcn_raw_buffer_load_
b3
2
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_thread_addr_offset
,
src_wave_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
)));
static_cast
<
index_t
>
(
coherence
)));
...
@@ -1447,14 +1227,14 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
...
@@ -1447,14 +1227,14 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
else
if
constexpr
(
N
==
4
)
else
if
constexpr
(
N
==
4
)
{
{
return
bit_cast
<
rtn_type
>
(
return
bit_cast
<
rtn_type
>
(
llvm
_amdgcn_raw_buffer_load_
i16x
4
(
src_wave_buffer_resource
,
__builtin
_amdgcn_raw_buffer_load_
b6
4
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_thread_addr_offset
,
src_wave_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
)));
static_cast
<
index_t
>
(
coherence
)));
}
}
else
if
constexpr
(
N
==
8
)
else
if
constexpr
(
N
==
8
)
{
{
int32x4_t
tmp
=
llvm
_amdgcn_raw_buffer_load_
i32x4
(
src_wave_buffer_resource
,
int32x4_t
tmp
=
__builtin
_amdgcn_raw_buffer_load_
b128
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_thread_addr_offset
,
src_wave_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
static_cast
<
index_t
>
(
coherence
));
...
@@ -1573,7 +1353,7 @@ CK_TILE_DEVICE void amd_async_buffer_load(CK_TILE_LDS_ADDR T* smem,
...
@@ -1573,7 +1353,7 @@ CK_TILE_DEVICE void amd_async_buffer_load(CK_TILE_LDS_ADDR T* smem,
template
<
index_t
N
,
template
<
index_t
N
,
amd_buffer_coherence_enum
coherence
=
amd_buffer_coherence_enum
::
coherence_default
>
amd_buffer_coherence_enum
coherence
=
amd_buffer_coherence_enum
::
coherence_default
>
CK_TILE_DEVICE
void
amd_buffer_store_impl_with_bytes
(
const
thread_buffer
<
int8_t
,
N
>
src_thread_data
,
CK_TILE_DEVICE
void
amd_buffer_store_impl_with_bytes
(
const
thread_buffer
<
int8_t
,
N
>
src_thread_data
,
int32x4
_t
dst_wave_buffer_resource
,
__amdgpu_buffer_rsrc
_t
dst_wave_buffer_resource
,
index_t
dst_thread_addr_offset
,
index_t
dst_thread_addr_offset
,
index_t
dst_wave_addr_offset
)
index_t
dst_wave_addr_offset
)
{
{
...
@@ -1582,7 +1362,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl_with_bytes(const thread_buffer<int8_t,
...
@@ -1582,7 +1362,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl_with_bytes(const thread_buffer<int8_t,
if
constexpr
(
N
==
1
)
if
constexpr
(
N
==
1
)
{
{
llvm
_amdgcn_raw_buffer_store_
i
8
(
bit_cast
<
int8_t
>
(
src_thread_data
),
__builtin
_amdgcn_raw_buffer_store_
b
8
(
bit_cast
<
int8_t
>
(
src_thread_data
),
dst_wave_buffer_resource
,
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
dst_wave_addr_offset
,
...
@@ -1591,7 +1371,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl_with_bytes(const thread_buffer<int8_t,
...
@@ -1591,7 +1371,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl_with_bytes(const thread_buffer<int8_t,
else
if
constexpr
(
N
==
2
)
else
if
constexpr
(
N
==
2
)
{
{
llvm
_amdgcn_raw_buffer_store_
i
16
(
bit_cast
<
int16_t
>
(
src_thread_data
),
__builtin
_amdgcn_raw_buffer_store_
b
16
(
bit_cast
<
int16_t
>
(
src_thread_data
),
dst_wave_buffer_resource
,
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
dst_wave_addr_offset
,
...
@@ -1599,7 +1379,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl_with_bytes(const thread_buffer<int8_t,
...
@@ -1599,7 +1379,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl_with_bytes(const thread_buffer<int8_t,
}
}
else
if
constexpr
(
N
==
4
)
else
if
constexpr
(
N
==
4
)
{
{
llvm
_amdgcn_raw_buffer_store_
i
32
(
bit_cast
<
int32_t
>
(
src_thread_data
),
__builtin
_amdgcn_raw_buffer_store_
b
32
(
bit_cast
<
int32_t
>
(
src_thread_data
),
dst_wave_buffer_resource
,
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
dst_wave_addr_offset
,
...
@@ -1607,7 +1387,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl_with_bytes(const thread_buffer<int8_t,
...
@@ -1607,7 +1387,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl_with_bytes(const thread_buffer<int8_t,
}
}
else
if
constexpr
(
N
==
8
)
else
if
constexpr
(
N
==
8
)
{
{
llvm
_amdgcn_raw_buffer_store_
i32x2
(
bit_cast
<
int32x2_t
>
(
src_thread_data
),
__builtin
_amdgcn_raw_buffer_store_
b64
(
bit_cast
<
int32x2_t
>
(
src_thread_data
),
dst_wave_buffer_resource
,
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
dst_wave_addr_offset
,
...
@@ -1615,7 +1395,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl_with_bytes(const thread_buffer<int8_t,
...
@@ -1615,7 +1395,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl_with_bytes(const thread_buffer<int8_t,
}
}
else
if
constexpr
(
N
==
16
)
else
if
constexpr
(
N
==
16
)
{
{
llvm
_amdgcn_raw_buffer_store_
i32x4
(
bit_cast
<
int32x4_t
>
(
src_thread_data
),
__builtin
_amdgcn_raw_buffer_store_
b128
(
bit_cast
<
int32x4_t
>
(
src_thread_data
),
dst_wave_buffer_resource
,
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
dst_wave_addr_offset
,
...
@@ -1623,14 +1403,14 @@ CK_TILE_DEVICE void amd_buffer_store_impl_with_bytes(const thread_buffer<int8_t,
...
@@ -1623,14 +1403,14 @@ CK_TILE_DEVICE void amd_buffer_store_impl_with_bytes(const thread_buffer<int8_t,
}
}
else
if
constexpr
(
N
==
32
)
else
if
constexpr
(
N
==
32
)
{
{
llvm
_amdgcn_raw_buffer_store_
i32x4
(
__builtin
_amdgcn_raw_buffer_store_
b128
(
src_thread_data
.
template
get_as
<
int32x4_t
>()[
number
<
0
>
{}],
src_thread_data
.
template
get_as
<
int32x4_t
>()[
number
<
0
>
{}],
dst_wave_buffer_resource
,
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
dst_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
static_cast
<
index_t
>
(
coherence
));
llvm
_amdgcn_raw_buffer_store_
i32x4
(
__builtin
_amdgcn_raw_buffer_store_
b128
(
src_thread_data
.
template
get_as
<
int32x4_t
>()[
number
<
1
>
{}],
src_thread_data
.
template
get_as
<
int32x4_t
>()[
number
<
1
>
{}],
dst_wave_buffer_resource
,
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_thread_addr_offset
,
...
@@ -1639,28 +1419,28 @@ CK_TILE_DEVICE void amd_buffer_store_impl_with_bytes(const thread_buffer<int8_t,
...
@@ -1639,28 +1419,28 @@ CK_TILE_DEVICE void amd_buffer_store_impl_with_bytes(const thread_buffer<int8_t,
}
}
else
if
constexpr
(
N
==
64
)
else
if
constexpr
(
N
==
64
)
{
{
llvm
_amdgcn_raw_buffer_store_
i32x4
(
__builtin
_amdgcn_raw_buffer_store_
b128
(
src_thread_data
.
template
get_as
<
int32x4_t
>()[
number
<
0
>
{}],
src_thread_data
.
template
get_as
<
int32x4_t
>()[
number
<
0
>
{}],
dst_wave_buffer_resource
,
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
dst_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
static_cast
<
index_t
>
(
coherence
));
llvm
_amdgcn_raw_buffer_store_
i32x4
(
__builtin
_amdgcn_raw_buffer_store_
b128
(
src_thread_data
.
template
get_as
<
int32x4_t
>()[
number
<
1
>
{}],
src_thread_data
.
template
get_as
<
int32x4_t
>()[
number
<
1
>
{}],
dst_wave_buffer_resource
,
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_thread_addr_offset
,
dst_wave_addr_offset
+
sizeof
(
int32_t
)
*
4
,
dst_wave_addr_offset
+
sizeof
(
int32_t
)
*
4
,
static_cast
<
index_t
>
(
coherence
));
static_cast
<
index_t
>
(
coherence
));
llvm
_amdgcn_raw_buffer_store_
i32x4
(
__builtin
_amdgcn_raw_buffer_store_
b128
(
src_thread_data
.
template
get_as
<
int32x4_t
>()[
number
<
2
>
{}],
src_thread_data
.
template
get_as
<
int32x4_t
>()[
number
<
2
>
{}],
dst_wave_buffer_resource
,
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_thread_addr_offset
,
dst_wave_addr_offset
+
sizeof
(
int32_t
)
*
8
,
dst_wave_addr_offset
+
sizeof
(
int32_t
)
*
8
,
static_cast
<
index_t
>
(
coherence
));
static_cast
<
index_t
>
(
coherence
));
llvm
_amdgcn_raw_buffer_store_
i32x4
(
__builtin
_amdgcn_raw_buffer_store_
b128
(
src_thread_data
.
template
get_as
<
int32x4_t
>()[
number
<
3
>
{}],
src_thread_data
.
template
get_as
<
int32x4_t
>()[
number
<
3
>
{}],
dst_wave_buffer_resource
,
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_thread_addr_offset
,
...
@@ -1673,7 +1453,7 @@ template <typename T,
...
@@ -1673,7 +1453,7 @@ template <typename T,
index_t
N
,
index_t
N
,
amd_buffer_coherence_enum
coherence
=
amd_buffer_coherence_enum
::
coherence_default
>
amd_buffer_coherence_enum
coherence
=
amd_buffer_coherence_enum
::
coherence_default
>
CK_TILE_DEVICE
void
amd_buffer_store_impl
(
const
thread_buffer
<
T
,
N
>
src_thread_data
,
CK_TILE_DEVICE
void
amd_buffer_store_impl
(
const
thread_buffer
<
T
,
N
>
src_thread_data
,
int32x4
_t
dst_wave_buffer_resource
,
__amdgpu_buffer_rsrc
_t
dst_wave_buffer_resource
,
index_t
dst_thread_addr_offset
,
index_t
dst_thread_addr_offset
,
index_t
dst_wave_addr_offset
)
index_t
dst_wave_addr_offset
)
{
{
...
@@ -1696,7 +1476,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
...
@@ -1696,7 +1476,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
{
{
if
constexpr
(
N
==
1
)
if
constexpr
(
N
==
1
)
{
{
llvm
_amdgcn_raw_buffer_store_
fp
32
(
bit_cast
<
float
>
(
src_thread_data
),
__builtin
_amdgcn_raw_buffer_store_
b
32
(
bit_cast
<
float
>
(
src_thread_data
),
dst_wave_buffer_resource
,
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
dst_wave_addr_offset
,
...
@@ -1704,7 +1484,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
...
@@ -1704,7 +1484,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
}
}
else
if
constexpr
(
N
==
2
)
else
if
constexpr
(
N
==
2
)
{
{
llvm
_amdgcn_raw_buffer_store_
fp32x2
(
bit_cast
<
fp32x2_t
>
(
src_thread_data
),
__builtin
_amdgcn_raw_buffer_store_
b64
(
bit_cast
<
fp32x2_t
>
(
src_thread_data
),
dst_wave_buffer_resource
,
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
dst_wave_addr_offset
,
...
@@ -1712,7 +1492,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
...
@@ -1712,7 +1492,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
}
}
else
if
constexpr
(
N
==
4
)
else
if
constexpr
(
N
==
4
)
{
{
llvm
_amdgcn_raw_buffer_store_
fp32x4
(
bit_cast
<
fp32x4_t
>
(
src_thread_data
),
__builtin
_amdgcn_raw_buffer_store_
b128
(
bit_cast
<
fp32x4_t
>
(
src_thread_data
),
dst_wave_buffer_resource
,
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
dst_wave_addr_offset
,
...
@@ -1720,13 +1500,13 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
...
@@ -1720,13 +1500,13 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
}
}
else
if
constexpr
(
N
==
8
)
else
if
constexpr
(
N
==
8
)
{
{
llvm
_amdgcn_raw_buffer_store_
fp32x4
(
__builtin
_amdgcn_raw_buffer_store_
b128
(
src_thread_data
.
template
get_as
<
fp32x4_t
>()[
number
<
0
>
{}],
src_thread_data
.
template
get_as
<
fp32x4_t
>()[
number
<
0
>
{}],
dst_wave_buffer_resource
,
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
dst_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
static_cast
<
index_t
>
(
coherence
));
llvm
_amdgcn_raw_buffer_store_
fp32x4
(
__builtin
_amdgcn_raw_buffer_store_
b128
(
src_thread_data
.
template
get_as
<
fp32x4_t
>()[
number
<
1
>
{}],
src_thread_data
.
template
get_as
<
fp32x4_t
>()[
number
<
1
>
{}],
dst_wave_buffer_resource
,
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_thread_addr_offset
,
...
@@ -1738,7 +1518,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
...
@@ -1738,7 +1518,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
{
{
if
constexpr
(
N
==
1
)
if
constexpr
(
N
==
1
)
{
{
llvm
_amdgcn_raw_buffer_store_
fp
16
(
bit_cast
<
_Float16
>
(
src_thread_data
),
__builtin
_amdgcn_raw_buffer_store_
b
16
(
bit_cast
<
_Float16
>
(
src_thread_data
),
dst_wave_buffer_resource
,
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
dst_wave_addr_offset
,
...
@@ -1746,7 +1526,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
...
@@ -1746,7 +1526,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
}
}
else
if
constexpr
(
N
==
2
)
else
if
constexpr
(
N
==
2
)
{
{
llvm
_amdgcn_raw_buffer_store_
fp16x
2
(
bit_cast
<
fp16x2_t
>
(
src_thread_data
),
__builtin
_amdgcn_raw_buffer_store_
b3
2
(
bit_cast
<
fp16x2_t
>
(
src_thread_data
),
dst_wave_buffer_resource
,
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
dst_wave_addr_offset
,
...
@@ -1754,7 +1534,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
...
@@ -1754,7 +1534,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
}
}
else
if
constexpr
(
N
==
4
)
else
if
constexpr
(
N
==
4
)
{
{
llvm
_amdgcn_raw_buffer_store_
fp16x
4
(
bit_cast
<
fp16x4_t
>
(
src_thread_data
),
__builtin
_amdgcn_raw_buffer_store_
b6
4
(
bit_cast
<
fp16x4_t
>
(
src_thread_data
),
dst_wave_buffer_resource
,
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
dst_wave_addr_offset
,
...
@@ -1765,19 +1545,19 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
...
@@ -1765,19 +1545,19 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
#if 0
#if 0
thread_buffer<fp16_t, 8> tmp{src_thread_data};
thread_buffer<fp16_t, 8> tmp{src_thread_data};
llvm
_amdgcn_raw_buffer_store_
fp16x
4(tmp.template get_as<fp16x4_t>()[number<0>{}],
__builtin
_amdgcn_raw_buffer_store_
b6
4(tmp.template get_as<fp16x4_t>()[number<0>{}],
dst_wave_buffer_resource,
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_thread_addr_offset,
dst_wave_addr_offset,
dst_wave_addr_offset,
static_cast<index_t>(coherence));
static_cast<index_t>(coherence));
llvm
_amdgcn_raw_buffer_store_
fp16x
4(tmp.template get_as<fp16x4_t>()[number<1>{}],
__builtin
_amdgcn_raw_buffer_store_
b6
4(tmp.template get_as<fp16x4_t>()[number<1>{}],
dst_wave_buffer_resource,
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_thread_addr_offset,
dst_wave_addr_offset + 4 * sizeof(fp16_t),
dst_wave_addr_offset + 4 * sizeof(fp16_t),
static_cast<index_t>(coherence));
static_cast<index_t>(coherence));
#else
#else
llvm
_amdgcn_raw_buffer_store_
fp32x4
(
bit_cast
<
fp32x4_t
>
(
src_thread_data
),
__builtin
_amdgcn_raw_buffer_store_
b128
(
bit_cast
<
fp32x4_t
>
(
src_thread_data
),
dst_wave_buffer_resource
,
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
dst_wave_addr_offset
,
...
@@ -1789,7 +1569,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
...
@@ -1789,7 +1569,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
{
{
if
constexpr
(
N
==
1
)
if
constexpr
(
N
==
1
)
{
{
llvm
_amdgcn_raw_buffer_store_
i
16
(
bit_cast
<
int16_t
>
(
src_thread_data
),
__builtin
_amdgcn_raw_buffer_store_
b
16
(
bit_cast
<
int16_t
>
(
src_thread_data
),
dst_wave_buffer_resource
,
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
dst_wave_addr_offset
,
...
@@ -1797,7 +1577,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
...
@@ -1797,7 +1577,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
}
}
else
if
constexpr
(
N
==
2
)
else
if
constexpr
(
N
==
2
)
{
{
llvm
_amdgcn_raw_buffer_store_
i16x
2
(
bit_cast
<
int16x2_t
>
(
src_thread_data
),
__builtin
_amdgcn_raw_buffer_store_
b3
2
(
bit_cast
<
int16x2_t
>
(
src_thread_data
),
dst_wave_buffer_resource
,
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
dst_wave_addr_offset
,
...
@@ -1805,7 +1585,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
...
@@ -1805,7 +1585,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
}
}
else
if
constexpr
(
N
==
4
)
else
if
constexpr
(
N
==
4
)
{
{
llvm
_amdgcn_raw_buffer_store_
i16x
4
(
bit_cast
<
int16x4_t
>
(
src_thread_data
),
__builtin
_amdgcn_raw_buffer_store_
b6
4
(
bit_cast
<
int16x4_t
>
(
src_thread_data
),
dst_wave_buffer_resource
,
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
dst_wave_addr_offset
,
...
@@ -1813,14 +1593,14 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
...
@@ -1813,14 +1593,14 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
}
}
else
if
constexpr
(
N
==
8
)
else
if
constexpr
(
N
==
8
)
{
{
llvm
_amdgcn_raw_buffer_store_
i16x
4
(
__builtin
_amdgcn_raw_buffer_store_
b6
4
(
src_thread_data
.
template
get_as
<
int16x4_t
>()[
number
<
0
>
{}],
src_thread_data
.
template
get_as
<
int16x4_t
>()[
number
<
0
>
{}],
dst_wave_buffer_resource
,
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
dst_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
static_cast
<
index_t
>
(
coherence
));
llvm
_amdgcn_raw_buffer_store_
i16x
4
(
__builtin
_amdgcn_raw_buffer_store_
b6
4
(
src_thread_data
.
template
get_as
<
int16x4_t
>()[
number
<
1
>
{}],
src_thread_data
.
template
get_as
<
int16x4_t
>()[
number
<
1
>
{}],
dst_wave_buffer_resource
,
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_thread_addr_offset
,
...
@@ -1832,7 +1612,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
...
@@ -1832,7 +1612,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
{
{
if
constexpr
(
N
==
1
)
if
constexpr
(
N
==
1
)
{
{
llvm
_amdgcn_raw_buffer_store_
ui
16
(
bit_cast
<
uint16_t
>
(
src_thread_data
),
__builtin
_amdgcn_raw_buffer_store_
b
16
(
bit_cast
<
uint16_t
>
(
src_thread_data
),
dst_wave_buffer_resource
,
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
dst_wave_addr_offset
,
...
@@ -1840,7 +1620,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
...
@@ -1840,7 +1620,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
}
}
else
if
constexpr
(
N
==
2
)
else
if
constexpr
(
N
==
2
)
{
{
llvm
_amdgcn_raw_buffer_store_
ui16x
2
(
bit_cast
<
uint16x2_t
>
(
src_thread_data
),
__builtin
_amdgcn_raw_buffer_store_
b3
2
(
bit_cast
<
uint16x2_t
>
(
src_thread_data
),
dst_wave_buffer_resource
,
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
dst_wave_addr_offset
,
...
@@ -1848,7 +1628,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
...
@@ -1848,7 +1628,7 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
}
}
else
if
constexpr
(
N
==
4
)
else
if
constexpr
(
N
==
4
)
{
{
llvm
_amdgcn_raw_buffer_store_
ui16x
4
(
bit_cast
<
uint16x4_t
>
(
src_thread_data
),
__builtin
_amdgcn_raw_buffer_store_
b6
4
(
bit_cast
<
uint16x4_t
>
(
src_thread_data
),
dst_wave_buffer_resource
,
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
dst_wave_addr_offset
,
...
@@ -1856,14 +1636,14 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
...
@@ -1856,14 +1636,14 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
}
}
else
if
constexpr
(
N
==
8
)
else
if
constexpr
(
N
==
8
)
{
{
llvm
_amdgcn_raw_buffer_store_
ui16x
4
(
__builtin
_amdgcn_raw_buffer_store_
b6
4
(
src_thread_data
.
template
get_as
<
uint16x4_t
>()[
number
<
0
>
{}],
src_thread_data
.
template
get_as
<
uint16x4_t
>()[
number
<
0
>
{}],
dst_wave_buffer_resource
,
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
dst_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
static_cast
<
index_t
>
(
coherence
));
llvm
_amdgcn_raw_buffer_store_
ui16x
4
(
__builtin
_amdgcn_raw_buffer_store_
b6
4
(
src_thread_data
.
template
get_as
<
uint16x4_t
>()[
number
<
1
>
{}],
src_thread_data
.
template
get_as
<
uint16x4_t
>()[
number
<
1
>
{}],
dst_wave_buffer_resource
,
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_thread_addr_offset
,
...
@@ -2159,8 +1939,8 @@ amd_buffer_load_invalid_element_return_zero(const T* p_src_wave,
...
@@ -2159,8 +1939,8 @@ amd_buffer_load_invalid_element_return_zero(const T* p_src_wave,
bool
src_thread_element_valid
,
bool
src_thread_element_valid
,
index_t
src_element_space_size
)
index_t
src_element_space_size
)
{
{
const
int32x4
_t
src_wave_buffer_resource
=
const
__amdgpu_buffer_rsrc
_t
src_wave_buffer_resource
=
make_wave_buffer_resource
(
p_src_wave
,
src_element_space_size
*
sizeof
(
T
));
make_wave_buffer_resource
_new
(
p_src_wave
,
src_element_space_size
*
sizeof
(
T
));
index_t
src_thread_addr_offset
=
src_thread_element_offset
*
sizeof
(
T
);
index_t
src_thread_addr_offset
=
src_thread_element_offset
*
sizeof
(
T
);
...
@@ -2198,8 +1978,8 @@ amd_buffer_load_invalid_element_return_customized_value(const T* p_src_wave,
...
@@ -2198,8 +1978,8 @@ amd_buffer_load_invalid_element_return_customized_value(const T* p_src_wave,
index_t
src_element_space_size
,
index_t
src_element_space_size
,
T
customized_value
)
T
customized_value
)
{
{
const
int32x4
_t
src_wave_buffer_resource
=
const
__amdgpu_buffer_rsrc
_t
src_wave_buffer_resource
=
make_wave_buffer_resource
(
p_src_wave
,
src_element_space_size
*
sizeof
(
T
));
make_wave_buffer_resource
_new
(
p_src_wave
,
src_element_space_size
*
sizeof
(
T
));
index_t
src_thread_addr_offset
=
src_thread_element_offset
*
sizeof
(
T
);
index_t
src_thread_addr_offset
=
src_thread_element_offset
*
sizeof
(
T
);
...
@@ -2225,8 +2005,8 @@ CK_TILE_DEVICE void amd_buffer_load_raw(thread_buffer<T, N>& dst,
...
@@ -2225,8 +2005,8 @@ CK_TILE_DEVICE void amd_buffer_load_raw(thread_buffer<T, N>& dst,
index_t
is_valid_element
=
0
,
index_t
is_valid_element
=
0
,
bool_constant
<
pre_nop
>
=
{})
bool_constant
<
pre_nop
>
=
{})
{
{
const
int32x4
_t
src_wave_buffer_resource
=
const
__amdgpu_buffer_rsrc
_t
src_wave_buffer_resource
=
make_wave_buffer_resource
(
p_src_wave
,
src_element_space_size
*
sizeof
(
T
));
make_wave_buffer_resource
_new
(
p_src_wave
,
src_element_space_size
*
sizeof
(
T
));
index_t
src_thread_addr_offset
=
src_thread_element_offset
*
sizeof
(
T
);
index_t
src_thread_addr_offset
=
src_thread_element_offset
*
sizeof
(
T
);
index_t
src_linear_addr_offset
=
src_linear_element_offset
*
sizeof
(
T
);
index_t
src_linear_addr_offset
=
src_linear_element_offset
*
sizeof
(
T
);
...
@@ -2248,7 +2028,7 @@ template <typename T,
...
@@ -2248,7 +2028,7 @@ template <typename T,
bool
oob_conditional_check
=
true
,
bool
oob_conditional_check
=
true
,
bool
pre_nop
=
false
>
bool
pre_nop
=
false
>
CK_TILE_DEVICE
void
amd_buffer_load_raw
(
thread_buffer
<
T
,
N
>&
dst
,
CK_TILE_DEVICE
void
amd_buffer_load_raw
(
thread_buffer
<
T
,
N
>&
dst
,
const
int32x4
_t
src_wave_buffer_resource
,
const
__amdgpu_buffer_rsrc
_t
src_wave_buffer_resource
,
index_t
src_thread_element_offset
,
index_t
src_thread_element_offset
,
index_t
src_linear_element_offset
,
index_t
src_linear_element_offset
,
index_t
is_valid_element
=
0
,
index_t
is_valid_element
=
0
,
...
@@ -2282,8 +2062,8 @@ CK_TILE_DEVICE void amd_async_buffer_load_with_oob_raw(T* smem,
...
@@ -2282,8 +2062,8 @@ CK_TILE_DEVICE void amd_async_buffer_load_with_oob_raw(T* smem,
index_t
src_element_space_size
,
index_t
src_element_space_size
,
bool_constant
<
pre_nop
>
=
{})
bool_constant
<
pre_nop
>
=
{})
{
{
const
int32x4
_t
src_wave_buffer_resource
=
const
__amdgpu_buffer_rsrc
_t
src_wave_buffer_resource
=
make_wave_buffer_resource
(
p_src_wave
,
src_element_space_size
*
sizeof
(
T
));
make_wave_buffer_resource
p_new
(
p_src_wave
,
src_element_space_size
*
sizeof
(
T
));
index_t
src_thread_addr_offset
=
src_thread_element_offset
*
sizeof
(
T
);
index_t
src_thread_addr_offset
=
src_thread_element_offset
*
sizeof
(
T
);
index_t
src_linear_addr_offset
=
src_linear_element_offset
*
sizeof
(
T
);
index_t
src_linear_addr_offset
=
src_linear_element_offset
*
sizeof
(
T
);
...
@@ -2302,7 +2082,7 @@ template <typename T,
...
@@ -2302,7 +2082,7 @@ template <typename T,
amd_buffer_coherence_enum
coherence
=
amd_buffer_coherence_enum
::
coherence_default
,
amd_buffer_coherence_enum
coherence
=
amd_buffer_coherence_enum
::
coherence_default
,
bool
pre_nop
=
false
>
bool
pre_nop
=
false
>
CK_TILE_DEVICE
void
amd_async_buffer_load_with_oob_raw
(
T
*
smem
,
CK_TILE_DEVICE
void
amd_async_buffer_load_with_oob_raw
(
T
*
smem
,
const
int32x4
_t
src_wave_buffer_resource
,
const
__amdgpu_buffer_rsrc
_t
src_wave_buffer_resource
,
index_t
src_thread_element_offset
,
index_t
src_thread_element_offset
,
index_t
src_linear_element_offset
,
index_t
src_linear_element_offset
,
bool_constant
<
pre_nop
>
=
{})
bool_constant
<
pre_nop
>
=
{})
...
@@ -2324,7 +2104,7 @@ template <typename T,
...
@@ -2324,7 +2104,7 @@ template <typename T,
amd_buffer_coherence_enum
coherence
=
amd_buffer_coherence_enum
::
coherence_default
,
amd_buffer_coherence_enum
coherence
=
amd_buffer_coherence_enum
::
coherence_default
,
bool
oob_conditional_check
=
false
>
bool
oob_conditional_check
=
false
>
CK_TILE_DEVICE
void
amd_async_buffer_load_with_oob
(
CK_TILE_LDS_ADDR
T
*
smem
,
CK_TILE_DEVICE
void
amd_async_buffer_load_with_oob
(
CK_TILE_LDS_ADDR
T
*
smem
,
const
int32x4
_t
src_wave_buffer_resource
,
const
__amdgpu_buffer_rsrc
_t
src_wave_buffer_resource
,
index_t
src_thread_element_offset
,
index_t
src_thread_element_offset
,
index_t
src_linear_element_offset
,
index_t
src_linear_element_offset
,
bool
is_valid_element
,
bool
is_valid_element
,
...
@@ -2356,8 +2136,8 @@ CK_TILE_DEVICE void amd_buffer_store(const thread_buffer<T, N>& src_thread_data,
...
@@ -2356,8 +2136,8 @@ CK_TILE_DEVICE void amd_buffer_store(const thread_buffer<T, N>& src_thread_data,
const
bool
dst_thread_element_valid
,
const
bool
dst_thread_element_valid
,
const
index_t
dst_element_space_size
)
const
index_t
dst_element_space_size
)
{
{
const
int32x4
_t
dst_wave_buffer_resource
=
const
__amdgpu_buffer_rsrc
_t
dst_wave_buffer_resource
=
make_wave_buffer_resource
(
p_dst_wave
,
dst_element_space_size
*
sizeof
(
T
));
make_wave_buffer_resource
_new
(
p_dst_wave
,
dst_element_space_size
*
sizeof
(
T
));
index_t
dst_thread_addr_offset
=
dst_thread_element_offset
*
sizeof
(
T
);
index_t
dst_thread_addr_offset
=
dst_thread_element_offset
*
sizeof
(
T
);
...
@@ -2398,8 +2178,8 @@ CK_TILE_DEVICE void amd_buffer_store_raw(const thread_buffer<T, N>& src_thread_d
...
@@ -2398,8 +2178,8 @@ CK_TILE_DEVICE void amd_buffer_store_raw(const thread_buffer<T, N>& src_thread_d
const
bool
dst_thread_element_valid
,
const
bool
dst_thread_element_valid
,
const
index_t
dst_element_space_size
)
const
index_t
dst_element_space_size
)
{
{
const
int32x4
_t
dst_wave_buffer_resource
=
const
__amdgpu_buffer_rsrc
_t
dst_wave_buffer_resource
=
make_wave_buffer_resource
(
p_dst_wave
,
dst_element_space_size
*
sizeof
(
T
));
make_wave_buffer_resource
_new
(
p_dst_wave
,
dst_element_space_size
*
sizeof
(
T
));
index_t
dst_thread_addr_offset
=
dst_thread_element_offset
*
sizeof
(
T
);
index_t
dst_thread_addr_offset
=
dst_thread_element_offset
*
sizeof
(
T
);
index_t
dst_linear_addr_offset
=
dst_linear_element_offset
*
sizeof
(
T
);
index_t
dst_linear_addr_offset
=
dst_linear_element_offset
*
sizeof
(
T
);
...
@@ -2550,4 +2330,4 @@ CK_TILE_DEVICE void amd_direct_load_global_to_lds(const T* global_base_ptr,
...
@@ -2550,4 +2330,4 @@ CK_TILE_DEVICE void amd_direct_load_global_to_lds(const T* global_base_ptr,
#endif
#endif
}
}
}
// namespace ck_tile
}
// namespace ck_tile
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment