Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
67204577
"vscode:/vscode.git/clone" did not exist on "5ba9ea8698c8c2847f1366139d46edfbb39d743b"
Commit
67204577
authored
Feb 11, 2025
by
illsilin
Browse files
fix clang format
parent
7567a2bd
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
189 additions
and
184 deletions
+189
-184
include/ck_tile/core/arch/amd_buffer_addressing.hpp
include/ck_tile/core/arch/amd_buffer_addressing.hpp
+189
-184
No files found.
include/ck_tile/core/arch/amd_buffer_addressing.hpp
View file @
67204577
...
...
@@ -34,10 +34,10 @@ CK_TILE_DEVICE int32x4_t make_wave_buffer_resource(const void* ptr, uint32_t siz
return
r
;
}
CK_TILE_DEVICE
__amdgpu_buffer_rsrc_t
make_wave_buffer_resource_new
(
const
void
*
ptr
,
uint32_t
size
=
0xffffffff
)
CK_TILE_DEVICE
__amdgpu_buffer_rsrc_t
make_wave_buffer_resource_new
(
const
void
*
ptr
,
uint32_t
size
=
0xffffffff
)
{
auto
p
=
const_cast
<
remove_cv_t
<
void
>*>
(
ptr
);
auto
p
=
const_cast
<
remove_cv_t
<
void
>*>
(
ptr
);
return
__builtin_amdgcn_make_buffer_rsrc
(
p
,
0
,
size
,
CK_TILE_BUFFER_RESOURCE_3RD_DWORD
);
}
...
...
@@ -906,12 +906,12 @@ CK_TILE_DEVICE_EXTERN float llvm_amdgcn_raw_buffer_atomic_add_fp32(
index_t
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.atomic.fadd.f32.v4i32"
);
// buffer atomic-max fp64
CK_TILE_DEVICE_EXTERN
double
llvm_amdgcn_raw_buffer_atomic_max_fp64
(
double
vdata
,
int32x4_t
rsrc
,
// dst_wave_buffer_resource
int
voffset
,
// dst_thread_addr_offset
int
soffset
,
// dst_wave_addr_offset
int
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.atomic.fmax.f64.v4i32"
);
CK_TILE_DEVICE_EXTERN
double
llvm_amdgcn_raw_buffer_atomic_max_fp64
(
double
vdata
,
int32x4_t
rsrc
,
// dst_wave_buffer_resource
int
voffset
,
// dst_thread_addr_offset
int
soffset
,
// dst_wave_addr_offset
int
glc_slc
)
__asm
(
"llvm.amdgcn.raw.buffer.atomic.fmax.f64.v4i32"
);
// Direct loads from global to LDS.
CK_TILE_DEVICE_EXTERN
void
...
...
@@ -977,58 +977,59 @@ amd_buffer_load_impl_with_bytes(__amdgpu_buffer_rsrc_t src_wave_buffer_resource,
if
constexpr
(
N
==
1
)
{
return
bit_cast
<
rtn_type
>
(
__builtin_amdgcn_raw_buffer_load_b8
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
)));
return
bit_cast
<
rtn_type
>
(
__builtin_amdgcn_raw_buffer_load_b8
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
)));
}
else
if
constexpr
(
N
==
2
)
{
int16_t
tmp
=
__builtin_amdgcn_raw_buffer_load_b16
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
src_thread_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
return
bit_cast
<
rtn_type
>
(
tmp
);
}
else
if
constexpr
(
N
==
4
)
{
int32_t
tmp
=
__builtin_amdgcn_raw_buffer_load_b32
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
src_thread_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
return
bit_cast
<
rtn_type
>
(
tmp
);
}
else
if
constexpr
(
N
==
8
)
{
int32x2_t
tmp
=
__builtin_amdgcn_raw_buffer_load_b64
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
src_thread_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
return
bit_cast
<
rtn_type
>
(
tmp
);
}
else
if
constexpr
(
N
==
16
)
{
int32x4_t
tmp
=
__builtin_amdgcn_raw_buffer_load_b128
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
src_thread_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
return
bit_cast
<
rtn_type
>
(
tmp
);
}
else
if
constexpr
(
N
==
32
)
{
int32x4_t
tmp0
=
__builtin_amdgcn_raw_buffer_load_b128
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
src_thread_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
int32x4_t
tmp1
=
__builtin_amdgcn_raw_buffer_load_b128
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_wave_addr_offset
+
4
*
sizeof
(
int32_t
),
static_cast
<
index_t
>
(
coherence
));
src_thread_addr_offset
,
src_wave_addr_offset
+
4
*
sizeof
(
int32_t
),
static_cast
<
index_t
>
(
coherence
));
thread_buffer
<
int32_t
,
8
>
tmp
;
tmp
.
template
get_as
<
int32x4_t
>()(
number
<
0
>
{})
=
tmp0
;
...
...
@@ -1039,24 +1040,24 @@ amd_buffer_load_impl_with_bytes(__amdgpu_buffer_rsrc_t src_wave_buffer_resource,
else
if
constexpr
(
N
==
64
)
{
int32x4_t
tmp0
=
__builtin_amdgcn_raw_buffer_load_b128
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
src_thread_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
int32x4_t
tmp1
=
__builtin_amdgcn_raw_buffer_load_b128
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_wave_addr_offset
+
4
*
sizeof
(
int32_t
),
static_cast
<
index_t
>
(
coherence
));
src_thread_addr_offset
,
src_wave_addr_offset
+
4
*
sizeof
(
int32_t
),
static_cast
<
index_t
>
(
coherence
));
int32x4_t
tmp2
=
__builtin_amdgcn_raw_buffer_load_b128
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_wave_addr_offset
+
8
*
sizeof
(
int32_t
),
static_cast
<
index_t
>
(
coherence
));
src_thread_addr_offset
,
src_wave_addr_offset
+
8
*
sizeof
(
int32_t
),
static_cast
<
index_t
>
(
coherence
));
int32x4_t
tmp3
=
__builtin_amdgcn_raw_buffer_load_b128
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_wave_addr_offset
+
12
*
sizeof
(
int32_t
),
static_cast
<
index_t
>
(
coherence
));
src_thread_addr_offset
,
src_wave_addr_offset
+
12
*
sizeof
(
int32_t
),
static_cast
<
index_t
>
(
coherence
));
thread_buffer
<
int32_t
,
16
>
tmp
;
...
...
@@ -1076,9 +1077,10 @@ amd_buffer_load_impl_with_bytes(__amdgpu_buffer_rsrc_t src_wave_buffer_resource,
template
<
typename
T
,
index_t
N
,
amd_buffer_coherence_enum
coherence
=
amd_buffer_coherence_enum
::
coherence_default
>
CK_TILE_DEVICE
thread_buffer
<
T
,
N
>
amd_buffer_load_impl
(
__amdgpu_buffer_rsrc_t
src_wave_buffer_resource
,
index_t
src_thread_addr_offset
,
index_t
src_wave_addr_offset
)
CK_TILE_DEVICE
thread_buffer
<
T
,
N
>
amd_buffer_load_impl
(
__amdgpu_buffer_rsrc_t
src_wave_buffer_resource
,
index_t
src_thread_addr_offset
,
index_t
src_wave_addr_offset
)
{
static_assert
(
(
std
::
is_same
<
T
,
double
>::
value
&&
(
N
==
1
||
N
==
2
||
N
==
4
||
N
==
8
))
||
...
...
@@ -1100,25 +1102,25 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(__amdgpu_buffer_rsrc_t s
{
return
bit_cast
<
rtn_type
>
(
__builtin_amdgcn_raw_buffer_load_b32
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
)));
src_thread_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
)));
}
else
if
constexpr
(
N
==
2
)
{
return
bit_cast
<
rtn_type
>
(
__builtin_amdgcn_raw_buffer_load_b64
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
)));
src_thread_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
)));
}
else
if
constexpr
(
N
==
4
)
{
return
bit_cast
<
rtn_type
>
(
__builtin_amdgcn_raw_buffer_load_b128
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
)));
src_thread_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
)));
}
else
if
constexpr
(
N
==
8
)
{
...
...
@@ -1126,15 +1128,15 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(__amdgpu_buffer_rsrc_t s
tmp
.
template
get_as
<
fp32x4_t
>()(
number
<
0
>
{})
=
__builtin_amdgcn_raw_buffer_load_b128
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
src_thread_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
tmp
.
template
get_as
<
fp32x4_t
>()(
number
<
1
>
{})
=
__builtin_amdgcn_raw_buffer_load_b128
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_wave_addr_offset
+
4
*
sizeof
(
float
),
static_cast
<
index_t
>
(
coherence
));
src_thread_addr_offset
,
src_wave_addr_offset
+
4
*
sizeof
(
float
),
static_cast
<
index_t
>
(
coherence
));
return
tmp
;
}
...
...
@@ -1144,27 +1146,27 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(__amdgpu_buffer_rsrc_t s
tmp
.
template
get_as
<
fp32x4_t
>()(
number
<
0
>
{})
=
__builtin_amdgcn_raw_buffer_load_b128
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
src_thread_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
tmp
.
template
get_as
<
fp32x4_t
>()(
number
<
1
>
{})
=
__builtin_amdgcn_raw_buffer_load_b128
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_wave_addr_offset
+
4
*
sizeof
(
float
),
static_cast
<
index_t
>
(
coherence
));
src_thread_addr_offset
,
src_wave_addr_offset
+
4
*
sizeof
(
float
),
static_cast
<
index_t
>
(
coherence
));
tmp
.
template
get_as
<
fp32x4_t
>()(
number
<
2
>
{})
=
__builtin_amdgcn_raw_buffer_load_b128
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_wave_addr_offset
+
8
*
sizeof
(
float
),
static_cast
<
index_t
>
(
coherence
));
src_thread_addr_offset
,
src_wave_addr_offset
+
8
*
sizeof
(
float
),
static_cast
<
index_t
>
(
coherence
));
tmp
.
template
get_as
<
fp32x4_t
>()(
number
<
3
>
{})
=
__builtin_amdgcn_raw_buffer_load_b128
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_wave_addr_offset
+
12
*
sizeof
(
float
),
static_cast
<
index_t
>
(
coherence
));
src_thread_addr_offset
,
src_wave_addr_offset
+
12
*
sizeof
(
float
),
static_cast
<
index_t
>
(
coherence
));
return
tmp
;
}
...
...
@@ -1175,33 +1177,33 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(__amdgpu_buffer_rsrc_t s
{
return
bit_cast
<
rtn_type
>
(
__builtin_amdgcn_raw_buffer_load_b16
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
)));
src_thread_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
)));
}
else
if
constexpr
(
N
==
2
)
{
return
bit_cast
<
rtn_type
>
(
__builtin_amdgcn_raw_buffer_load_b32
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
)));
src_thread_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
)));
}
else
if
constexpr
(
N
==
4
)
{
return
bit_cast
<
rtn_type
>
(
__builtin_amdgcn_raw_buffer_load_b64
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
)));
src_thread_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
)));
}
else
if
constexpr
(
N
==
8
)
{
// use fp32 load to mimic fp16 load
fp32x4_t
tmp
=
__builtin_amdgcn_raw_buffer_load_b128
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
src_thread_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
return
bit_cast
<
rtn_type
>
(
tmp
);
}
...
...
@@ -1212,32 +1214,32 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(__amdgpu_buffer_rsrc_t s
{
return
bit_cast
<
rtn_type
>
(
__builtin_amdgcn_raw_buffer_load_b16
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
)));
src_thread_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
)));
}
else
if
constexpr
(
N
==
2
)
{
return
bit_cast
<
rtn_type
>
(
__builtin_amdgcn_raw_buffer_load_b32
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
)));
src_thread_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
)));
}
else
if
constexpr
(
N
==
4
)
{
return
bit_cast
<
rtn_type
>
(
__builtin_amdgcn_raw_buffer_load_b64
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
)));
src_thread_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
)));
}
else
if
constexpr
(
N
==
8
)
{
int32x4_t
tmp
=
__builtin_amdgcn_raw_buffer_load_b128
(
src_wave_buffer_resource
,
src_thread_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
src_thread_addr_offset
,
src_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
return
bit_cast
<
rtn_type
>
(
tmp
);
}
...
...
@@ -1352,10 +1354,11 @@ CK_TILE_DEVICE void amd_async_buffer_load(CK_TILE_LDS_ADDR T* smem,
template
<
index_t
N
,
amd_buffer_coherence_enum
coherence
=
amd_buffer_coherence_enum
::
coherence_default
>
CK_TILE_DEVICE
void
amd_buffer_store_impl_with_bytes
(
const
thread_buffer
<
int8_t
,
N
>
src_thread_data
,
__amdgpu_buffer_rsrc_t
dst_wave_buffer_resource
,
index_t
dst_thread_addr_offset
,
index_t
dst_wave_addr_offset
)
CK_TILE_DEVICE
void
amd_buffer_store_impl_with_bytes
(
const
thread_buffer
<
int8_t
,
N
>
src_thread_data
,
__amdgpu_buffer_rsrc_t
dst_wave_buffer_resource
,
index_t
dst_thread_addr_offset
,
index_t
dst_wave_addr_offset
)
{
static_assert
(
N
==
1
||
N
==
2
||
N
==
4
||
N
==
8
||
N
==
16
||
N
==
32
||
N
==
64
,
"wrong! not implemented"
);
...
...
@@ -1363,43 +1366,43 @@ CK_TILE_DEVICE void amd_buffer_store_impl_with_bytes(const thread_buffer<int8_t,
if
constexpr
(
N
==
1
)
{
__builtin_amdgcn_raw_buffer_store_b8
(
bit_cast
<
int8_t
>
(
src_thread_data
),
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
}
else
if
constexpr
(
N
==
2
)
{
__builtin_amdgcn_raw_buffer_store_b16
(
bit_cast
<
int16_t
>
(
src_thread_data
),
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
}
else
if
constexpr
(
N
==
4
)
{
__builtin_amdgcn_raw_buffer_store_b32
(
bit_cast
<
int32_t
>
(
src_thread_data
),
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
}
else
if
constexpr
(
N
==
8
)
{
__builtin_amdgcn_raw_buffer_store_b64
(
bit_cast
<
int32x2_t
>
(
src_thread_data
),
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
}
else
if
constexpr
(
N
==
16
)
{
__builtin_amdgcn_raw_buffer_store_b128
(
bit_cast
<
int32x4_t
>
(
src_thread_data
),
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
}
else
if
constexpr
(
N
==
32
)
{
...
...
@@ -1477,26 +1480,26 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
if
constexpr
(
N
==
1
)
{
__builtin_amdgcn_raw_buffer_store_b32
(
bit_cast
<
float
>
(
src_thread_data
),
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
}
else
if
constexpr
(
N
==
2
)
{
__builtin_amdgcn_raw_buffer_store_b64
(
bit_cast
<
fp32x2_t
>
(
src_thread_data
),
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
}
else
if
constexpr
(
N
==
4
)
{
__builtin_amdgcn_raw_buffer_store_b128
(
bit_cast
<
fp32x4_t
>
(
src_thread_data
),
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
}
else
if
constexpr
(
N
==
8
)
{
...
...
@@ -1519,26 +1522,26 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
if
constexpr
(
N
==
1
)
{
__builtin_amdgcn_raw_buffer_store_b16
(
bit_cast
<
_Float16
>
(
src_thread_data
),
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
}
else
if
constexpr
(
N
==
2
)
{
__builtin_amdgcn_raw_buffer_store_b32
(
bit_cast
<
fp16x2_t
>
(
src_thread_data
),
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
}
else
if
constexpr
(
N
==
4
)
{
__builtin_amdgcn_raw_buffer_store_b64
(
bit_cast
<
fp16x4_t
>
(
src_thread_data
),
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
}
else
if
constexpr
(
N
==
8
)
{
...
...
@@ -1558,10 +1561,10 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
static_cast<index_t>(coherence));
#else
__builtin_amdgcn_raw_buffer_store_b128
(
bit_cast
<
fp32x4_t
>
(
src_thread_data
),
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
#endif
}
}
...
...
@@ -1570,26 +1573,26 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
if
constexpr
(
N
==
1
)
{
__builtin_amdgcn_raw_buffer_store_b16
(
bit_cast
<
int16_t
>
(
src_thread_data
),
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
}
else
if
constexpr
(
N
==
2
)
{
__builtin_amdgcn_raw_buffer_store_b32
(
bit_cast
<
int16x2_t
>
(
src_thread_data
),
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
}
else
if
constexpr
(
N
==
4
)
{
__builtin_amdgcn_raw_buffer_store_b64
(
bit_cast
<
int16x4_t
>
(
src_thread_data
),
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
}
else
if
constexpr
(
N
==
8
)
{
...
...
@@ -1613,26 +1616,26 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
if
constexpr
(
N
==
1
)
{
__builtin_amdgcn_raw_buffer_store_b16
(
bit_cast
<
uint16_t
>
(
src_thread_data
),
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
}
else
if
constexpr
(
N
==
2
)
{
__builtin_amdgcn_raw_buffer_store_b32
(
bit_cast
<
uint16x2_t
>
(
src_thread_data
),
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
}
else
if
constexpr
(
N
==
4
)
{
__builtin_amdgcn_raw_buffer_store_b64
(
bit_cast
<
uint16x4_t
>
(
src_thread_data
),
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
dst_wave_addr_offset
,
static_cast
<
index_t
>
(
coherence
));
}
else
if
constexpr
(
N
==
8
)
{
...
...
@@ -2081,11 +2084,12 @@ template <typename T,
index_t
N
,
amd_buffer_coherence_enum
coherence
=
amd_buffer_coherence_enum
::
coherence_default
,
bool
pre_nop
=
false
>
CK_TILE_DEVICE
void
amd_async_buffer_load_with_oob_raw
(
T
*
smem
,
const
__amdgpu_buffer_rsrc_t
src_wave_buffer_resource
,
index_t
src_thread_element_offset
,
index_t
src_linear_element_offset
,
bool_constant
<
pre_nop
>
=
{})
CK_TILE_DEVICE
void
amd_async_buffer_load_with_oob_raw
(
T
*
smem
,
const
__amdgpu_buffer_rsrc_t
src_wave_buffer_resource
,
index_t
src_thread_element_offset
,
index_t
src_linear_element_offset
,
bool_constant
<
pre_nop
>
=
{})
{
index_t
src_thread_addr_offset
=
src_thread_element_offset
*
sizeof
(
T
);
index_t
src_linear_addr_offset
=
src_linear_element_offset
*
sizeof
(
T
);
...
...
@@ -2103,12 +2107,13 @@ template <typename T,
index_t
N
,
amd_buffer_coherence_enum
coherence
=
amd_buffer_coherence_enum
::
coherence_default
,
bool
oob_conditional_check
=
false
>
CK_TILE_DEVICE
void
amd_async_buffer_load_with_oob
(
CK_TILE_LDS_ADDR
T
*
smem
,
const
__amdgpu_buffer_rsrc_t
src_wave_buffer_resource
,
index_t
src_thread_element_offset
,
index_t
src_linear_element_offset
,
bool
is_valid_element
,
bool_constant
<
oob_conditional_check
>
=
{})
CK_TILE_DEVICE
void
amd_async_buffer_load_with_oob
(
CK_TILE_LDS_ADDR
T
*
smem
,
const
__amdgpu_buffer_rsrc_t
src_wave_buffer_resource
,
index_t
src_thread_element_offset
,
index_t
src_linear_element_offset
,
bool
is_valid_element
,
bool_constant
<
oob_conditional_check
>
=
{})
{
index_t
src_thread_addr_offset
=
src_thread_element_offset
*
sizeof
(
T
);
index_t
src_linear_addr_offset
=
src_linear_element_offset
*
sizeof
(
T
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment