Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
2a768425
Commit
2a768425
authored
Dec 20, 2024
by
Mateusz Ozga
Browse files
Merge remote-tracking branch 'origin/develop' into mozga-amd/universal_gemm_weight
parents
f358055f
1c45ca35
Changes
58
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
3431 additions
and
2082 deletions
+3431
-2082
include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp
...operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp
+1
-0
include/ck/utility/amd_ck_fp8.hpp
include/ck/utility/amd_ck_fp8.hpp
+18
-6
include/ck_tile/core/arch/amd_buffer_addressing.hpp
include/ck_tile/core/arch/amd_buffer_addressing.hpp
+2
-2
include/ck_tile/core/container/meta_data_buffer.hpp
include/ck_tile/core/container/meta_data_buffer.hpp
+3
-3
include/ck_tile/core/tensor/static_distributed_tensor.hpp
include/ck_tile/core/tensor/static_distributed_tensor.hpp
+1
-0
include/ck_tile/host/arg_parser.hpp
include/ck_tile/host/arg_parser.hpp
+44
-2
include/ck_tile/host/reference/reference_gemm.hpp
include/ck_tile/host/reference/reference_gemm.hpp
+11
-151
include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.inc
.../block/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.inc
+794
-589
include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16_itl.inc
...ck/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16_itl.inc
+750
-689
include/ck_tile/ops/flatmm/block/uk/flatmm_uk_gfx9_32x512x128_1x1x1_16x16x16.inc
...tmm/block/uk/flatmm_uk_gfx9_32x512x128_1x1x1_16x16x16.inc
+504
-503
include/ck_tile/ops/fmha.hpp
include/ck_tile/ops/fmha.hpp
+2
-0
include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
+4
-2
include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp
..._tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp
+37
-19
include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
+6
-3
include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline.hpp
...fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline.hpp
+65
-18
include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline_default_policy.hpp
...lock_fmha_fwd_splitkv_combine_pipeline_default_policy.hpp
+126
-47
include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs.hpp
...ock_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs.hpp
+794
-0
include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs_default_policy.hpp
...litkv_pipeline_nwarp_sshuffle_qr_ks_vs_default_policy.hpp
+226
-0
include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp
...ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp
+29
-7
include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp
...a/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp
+14
-41
No files found.
include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp
View file @
2a768425
...
...
@@ -729,6 +729,7 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
return
str
.
str
();
}
REGISTER_EXTRA_PRINTING_METHODS
};
}
// namespace device
...
...
include/ck/utility/amd_ck_fp8.hpp
View file @
2a768425
...
...
@@ -18,6 +18,20 @@
#define CK_USE_OCP_FP8 0
#endif
namespace
{
// https://en.cppreference.com/w/cpp/types/conditional
template
<
bool
B
,
class
T
,
class
F
>
struct
conditional
{
using
type
=
T
;
};
template
<
class
T
,
class
F
>
struct
conditional
<
false
,
T
,
F
>
{
using
type
=
F
;
};
}
// namespace
namespace
ck
{
using
f8_fnuz_t
=
_BitInt
(
8
);
...
...
@@ -191,11 +205,10 @@ __host__ __device__ static inline T cast_from_f8(fp8_storage_t x)
}
}
typename
__hip_internal
::
conditional
<
typename
conditional
<
sizeof
(
T
)
==
2
,
unsigned
short
int
,
typename
__hip_internal
::
conditional
<
sizeof
(
T
)
==
4
,
unsigned
int
,
unsigned
long
long
>::
type
>::
type
retval
;
typename
conditional
<
sizeof
(
T
)
==
4
,
unsigned
int
,
unsigned
long
long
>::
type
>::
type
retval
;
if
constexpr
(
we
==
5
&&
is_half
&&
!
is_fnuz
)
{
...
...
@@ -538,11 +551,10 @@ __host__ __device__ static inline fp8_storage_t cast_to_f8(T _x, unsigned int rn
constexpr
int
mfmt
=
(
sizeof
(
T
)
==
8
)
?
52
:
((
sizeof
(
T
)
==
4
)
?
23
:
10
);
using
T_bitwise
=
typename
__hip_internal
::
conditional
<
using
T_bitwise
=
typename
conditional
<
sizeof
(
T
)
==
2
,
unsigned
short
int
,
typename
__hip_internal
::
conditional
<
sizeof
(
T
)
==
4
,
unsigned
int
,
unsigned
long
long
>::
type
>::
type
;
typename
conditional
<
sizeof
(
T
)
==
4
,
unsigned
int
,
unsigned
long
long
>::
type
>::
type
;
T_bitwise
x_bitwise
=
bit_cast
<
T_bitwise
>
(
_x
);
unsigned
long
long
x
{
x_bitwise
};
...
...
include/ck_tile/core/arch/amd_buffer_addressing.hpp
View file @
2a768425
...
...
@@ -1303,8 +1303,8 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
static_assert
(
(
std
::
is_same
<
T
,
double
>::
value
&&
(
N
==
1
||
N
==
2
||
N
==
4
||
N
==
8
))
||
(
std
::
is_same
<
T
,
float
>::
value
&&
(
N
==
1
||
N
==
2
||
N
==
4
||
N
==
8
||
N
==
16
))
||
(
std
::
is_same
<
T
,
fp16_t
>::
value
&&
(
N
==
1
||
N
==
2
||
N
==
4
||
N
==
8
||
N
==
16
))
||
(
std
::
is_same
<
T
,
bf16_t
>::
value
&&
(
N
==
1
||
N
==
2
||
N
==
4
||
N
==
8
||
N
==
16
))
||
(
std
::
is_same
<
T
,
fp16_t
>::
value
&&
(
N
==
1
||
N
==
2
||
N
==
4
||
N
==
8
))
||
(
std
::
is_same
<
T
,
bf16_t
>::
value
&&
(
N
==
1
||
N
==
2
||
N
==
4
||
N
==
8
))
||
(
std
::
is_same
<
T
,
int32_t
>::
value
&&
(
N
==
1
||
N
==
2
||
N
==
4
||
N
==
8
||
N
==
16
))
||
(
std
::
is_same
<
T
,
fp8_t
>::
value
&&
(
N
==
1
||
N
==
2
||
N
==
4
||
N
==
8
||
N
==
16
))
||
...
...
include/ck_tile/core/container/meta_data_buffer.hpp
View file @
2a768425
...
...
@@ -30,7 +30,7 @@ struct meta_data_buffer
{
constexpr
index_t
size
=
sizeof
(
T
);
auto
tmp
=
bit_cast
<
array
<
std
::
byte
,
size
>>
(
data
);
auto
tmp
=
ck_tile
::
bit_cast
<
array
<
std
::
byte
,
size
>>
(
data
);
for
(
int
i
=
0
;
i
<
size
;
i
++
)
{
...
...
@@ -66,7 +66,7 @@ struct meta_data_buffer
pos
++
;
}
data
=
bit_cast
<
T
>
(
tmp
);
data
=
ck_tile
::
bit_cast
<
T
>
(
tmp
);
}
return
data
;
...
...
@@ -86,7 +86,7 @@ struct meta_data_buffer
pos
++
;
}
auto
data
=
bit_cast
<
T
>
(
tmp
);
auto
data
=
ck_tile
::
bit_cast
<
T
>
(
tmp
);
return
data
;
}
...
...
include/ck_tile/core/tensor/static_distributed_tensor.hpp
View file @
2a768425
...
...
@@ -29,6 +29,7 @@ struct static_distributed_tensor
remove_cvref_t
<
decltype
(
StaticTileDistribution
{}.
get_ys_to_d_descriptor
())
>
;
static
constexpr
index_t
kThreadElementSpaceSize
=
ThreadTensorDesc
{}.
get_element_space_size
();
static_assert
(
0
<
kThreadElementSpaceSize
,
"Make sure tile distribution is valid"
);
CK_TILE_HOST_DEVICE
static
constexpr
auto
get_num_of_dimension
()
{
...
...
include/ck_tile/host/arg_parser.hpp
View file @
2a768425
...
...
@@ -15,11 +15,14 @@
namespace
ck_tile
{
/*
* a host side utility, arg parser for
* -[key0]=[value0] -[key1]=[value1] ...
* a host side utility, arg parser for, either
* -[key0] = [value0, value1, value2]
* or
* -[key0]=[value0] -[key1]=[value1] ...
*/
class
ArgParser
{
public:
class
Arg
{
...
...
@@ -187,6 +190,45 @@ class ArgParser
return
value
;
}
std
::
vector
<
std
::
string
>
get_string_vec
(
const
std
::
string
&
name
,
const
std
::
string
&
delimiter
=
","
)
const
{
if
(
get_str
(
name
).
empty
())
{
return
{};
}
std
::
string
s
=
get_str
(
name
);
std
::
vector
<
std
::
string
>
tokens
;
size_t
pos
=
0
;
std
::
string
token
;
while
((
pos
=
s
.
find
(
delimiter
))
!=
std
::
string
::
npos
)
{
token
=
s
.
substr
(
0
,
pos
);
tokens
.
push_back
(
token
);
s
.
erase
(
0
,
pos
+
delimiter
.
length
());
}
tokens
.
push_back
(
s
);
return
tokens
;
}
std
::
vector
<
int
>
get_int_vec
(
const
std
::
string
&
name
,
const
std
::
string
&
delimiter
=
","
)
const
{
if
(
get_str
(
name
).
empty
())
{
return
{};
}
const
std
::
vector
<
std
::
string
>
args
=
get_string_vec
(
name
,
delimiter
);
std
::
vector
<
int
>
tokens
;
tokens
.
reserve
(
static_cast
<
int
>
(
args
.
size
()));
for
(
const
std
::
string
&
token
:
args
)
{
int
value
=
atoi
(
token
.
c_str
());
tokens
.
push_back
(
value
);
}
return
tokens
;
}
private:
std
::
unordered_map
<
std
::
string
,
Arg
>
input_map
;
std
::
vector
<
std
::
string
>
keys
;
...
...
include/ck_tile/host/reference/reference_gemm.hpp
View file @
2a768425
...
...
@@ -97,9 +97,9 @@ template <typename ADataType,
typename
LayoutA
,
typename
LayoutB
,
typename
LayoutC
>
void
reference_gemm_gpu
(
DeviceMem
&
a_device
,
DeviceMem
&
b_device
,
DeviceMem
&
c_device
,
void
reference_gemm_gpu
(
ADataType
*
a_ptr
,
BDataType
*
b_ptr
,
CDataType
*
c_ptr
,
index_t
M
,
index_t
N
,
index_t
K
,
...
...
@@ -107,79 +107,13 @@ void reference_gemm_gpu(DeviceMem& a_device,
index_t
stride_b
,
index_t
stride_c
)
{
ADataType
*
d_A
;
BDataType
*
d_B
;
CDataType
*
d_C
;
hipError_t
errA
=
hipMalloc
(
&
d_A
,
M
*
K
*
sizeof
(
ADataType
));
hipError_t
errB
=
hipMalloc
(
&
d_B
,
N
*
K
*
sizeof
(
BDataType
));
hipError_t
errC
=
hipMalloc
(
&
d_C
,
M
*
N
*
sizeof
(
CDataType
));
if
(
errA
!=
hipSuccess
)
{
std
::
cerr
<<
"Error allocating device memory for A: "
<<
hipGetErrorString
(
errA
)
<<
std
::
endl
;
return
;
// Early exit on error
}
if
(
errB
!=
hipSuccess
)
{
std
::
cerr
<<
"Error allocating device memory for B: "
<<
hipGetErrorString
(
errB
)
<<
std
::
endl
;
return
;
// Early exit on error
}
if
(
errC
!=
hipSuccess
)
{
std
::
cerr
<<
"Error allocating device memory for C: "
<<
hipGetErrorString
(
errC
)
<<
std
::
endl
;
return
;
// Early exit on error
}
errA
=
hipMemcpy
(
d_A
,
a_device
.
GetDeviceBuffer
(),
M
*
K
*
sizeof
(
ADataType
),
hipMemcpyHostToDevice
);
if
(
errA
!=
hipSuccess
)
{
std
::
cerr
<<
"Error copying A to device: "
<<
hipGetErrorString
(
errA
)
<<
std
::
endl
;
}
errB
=
hipMemcpy
(
d_B
,
b_device
.
GetDeviceBuffer
(),
N
*
K
*
sizeof
(
BDataType
),
hipMemcpyHostToDevice
);
if
(
errB
!=
hipSuccess
)
{
std
::
cerr
<<
"Error copying B to device: "
<<
hipGetErrorString
(
errB
)
<<
std
::
endl
;
}
int
totalElements
=
M
*
N
;
int
numThreadsPerBlock
=
256
;
// Common choice for threads per block
int
numBlocks
=
(
totalElements
+
numThreadsPerBlock
-
1
)
/
numThreadsPerBlock
;
naive_gemm_kernel
<
ADataType
,
BDataType
,
AccDataType
,
CDataType
,
LayoutA
,
LayoutB
,
LayoutC
>
<<<
numBlocks
,
numThreadsPerBlock
>>>
(
d_A
,
d_B
,
d_C
,
M
,
N
,
K
,
stride_a
,
stride_b
,
stride_c
);
errC
=
hipMemcpy
(
c_device
.
GetDeviceBuffer
(),
d_C
,
M
*
N
*
sizeof
(
CDataType
),
hipMemcpyDeviceToHost
);
if
(
errC
!=
hipSuccess
)
{
std
::
cerr
<<
"Error copying C to device: "
<<
hipGetErrorString
(
errC
)
<<
std
::
endl
;
}
errA
=
hipFree
(
d_A
);
if
(
errA
!=
hipSuccess
)
{
std
::
cerr
<<
"Error free the A memory: "
<<
hipGetErrorString
(
errA
)
<<
std
::
endl
;
}
errB
=
hipFree
(
d_B
);
if
(
errB
!=
hipSuccess
)
{
std
::
cerr
<<
"Error free the B memory: "
<<
hipGetErrorString
(
errB
)
<<
std
::
endl
;
}
errC
=
hipFree
(
d_C
);
if
(
errC
!=
hipSuccess
)
{
std
::
cerr
<<
"Error free the C memory: "
<<
hipGetErrorString
(
errC
)
<<
std
::
endl
;
}
<<<
numBlocks
,
numThreadsPerBlock
>>>
(
a_ptr
,
b_ptr
,
c_ptr
,
M
,
N
,
K
,
stride_a
,
stride_b
,
stride_c
);
return
;
}
...
...
@@ -191,9 +125,9 @@ template <typename ADataType,
typename
LayoutA
,
typename
LayoutB
,
typename
LayoutC
>
void
reference_batched_gemm_gpu
(
DeviceMem
&
a_device
,
DeviceMem
&
b_device
,
DeviceMem
&
c_device
,
void
reference_batched_gemm_gpu
(
ADataType
*
a_ptr
,
BDataType
*
b_ptr
,
CDataType
*
c_ptr
,
index_t
M
,
index_t
N
,
index_t
K
,
...
...
@@ -205,94 +139,20 @@ void reference_batched_gemm_gpu(DeviceMem& a_device,
index_t
batch_stride_C
,
index_t
batch_count
)
{
ADataType
*
d_A
;
BDataType
*
d_B
;
CDataType
*
d_C
;
hipError_t
errA
=
hipMalloc
(
&
d_A
,
batch_count
*
M
*
K
*
sizeof
(
ADataType
));
hipError_t
errB
=
hipMalloc
(
&
d_B
,
batch_count
*
N
*
K
*
sizeof
(
BDataType
));
hipError_t
errC
=
hipMalloc
(
&
d_C
,
batch_count
*
M
*
N
*
sizeof
(
CDataType
));
if
(
errA
!=
hipSuccess
)
{
std
::
cerr
<<
"Error allocating device memory for A: "
<<
hipGetErrorString
(
errA
)
<<
std
::
endl
;
return
;
// Early exit on error
}
if
(
errB
!=
hipSuccess
)
{
std
::
cerr
<<
"Error allocating device memory for B: "
<<
hipGetErrorString
(
errB
)
<<
std
::
endl
;
return
;
// Early exit on error
}
if
(
errC
!=
hipSuccess
)
{
std
::
cerr
<<
"Error allocating device memory for C: "
<<
hipGetErrorString
(
errC
)
<<
std
::
endl
;
return
;
// Early exit on error
}
errA
=
hipMemcpy
(
d_A
,
a_device
.
GetDeviceBuffer
(),
batch_count
*
M
*
K
*
sizeof
(
ADataType
),
hipMemcpyHostToDevice
);
if
(
errA
!=
hipSuccess
)
{
std
::
cerr
<<
"Error copying A to device: "
<<
hipGetErrorString
(
errA
)
<<
std
::
endl
;
}
errB
=
hipMemcpy
(
d_B
,
b_device
.
GetDeviceBuffer
(),
batch_count
*
N
*
K
*
sizeof
(
BDataType
),
hipMemcpyHostToDevice
);
if
(
errB
!=
hipSuccess
)
{
std
::
cerr
<<
"Error copying B to device: "
<<
hipGetErrorString
(
errB
)
<<
std
::
endl
;
}
int
totalElements
=
M
*
N
;
int
numThreadsPerBlock
=
256
;
// Common choice for threads per block
int
numBlocks
=
(
totalElements
+
numThreadsPerBlock
-
1
)
/
numThreadsPerBlock
;
for
(
index_t
batch_id
=
0
;
batch_id
<
batch_count
;
++
batch_id
)
{
ADataType
*
d_ATemp
=
d_A
+
batch_id
*
batch_stride_A
;
BDataType
*
d_BTemp
=
d_B
+
batch_id
*
batch_stride_B
;
CDataType
*
d_CTemp
=
d_C
+
batch_id
*
batch_stride_C
;
ADataType
*
d_ATemp
=
a_ptr
+
batch_id
*
batch_stride_A
;
BDataType
*
d_BTemp
=
b_ptr
+
batch_id
*
batch_stride_B
;
CDataType
*
d_CTemp
=
c_ptr
+
batch_id
*
batch_stride_C
;
naive_gemm_kernel
<
ADataType
,
BDataType
,
AccDataType
,
CDataType
,
LayoutA
,
LayoutB
,
LayoutC
>
<<<
numBlocks
,
numThreadsPerBlock
>>>
(
d_ATemp
,
d_BTemp
,
d_CTemp
,
M
,
N
,
K
,
stride_a
,
stride_b
,
stride_c
);
}
errC
=
hipMemcpy
(
c_device
.
GetDeviceBuffer
(),
d_C
,
batch_count
*
M
*
N
*
sizeof
(
CDataType
),
hipMemcpyDeviceToHost
);
if
(
errC
!=
hipSuccess
)
{
std
::
cerr
<<
"Error copying C to device: "
<<
hipGetErrorString
(
errC
)
<<
std
::
endl
;
}
errA
=
hipFree
(
d_A
);
if
(
errA
!=
hipSuccess
)
{
std
::
cerr
<<
"Error free the A memory: "
<<
hipGetErrorString
(
errA
)
<<
std
::
endl
;
}
errB
=
hipFree
(
d_B
);
if
(
errB
!=
hipSuccess
)
{
std
::
cerr
<<
"Error free the B memory: "
<<
hipGetErrorString
(
errB
)
<<
std
::
endl
;
}
errC
=
hipFree
(
d_C
);
if
(
errC
!=
hipSuccess
)
{
std
::
cerr
<<
"Error free the C memory: "
<<
hipGetErrorString
(
errC
)
<<
std
::
endl
;
}
return
;
}
}
// namespace ck_tile
include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.inc
View file @
2a768425
...
...
@@ -3,610 +3,815 @@
#endif
#if CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_BF16
#
define _UK_MFMA_ "v_mfma_f32_16x16x16_bf16"
#define _UK_MFMA_ "v_mfma_f32_16x16x16_bf16"
#
define _UK_PK_CVT_(x0_, x1_, y_) \
" v_cmp_u_f32 s[36:37], "
x0_
", "
x0_
"
\n
"
\
" v_add3_u32 v50, "
x0_
", %[v_nan_lo], 1
\n
"
\
" v_cndmask_b32 v54, v50, %[v_nan_hi], s[36:37]
\n
"
\
" v_cmp_u_f32 s[36:37], "
x1_
", "
x1_
"
\n
"
\
" v_add3_u32 v50, "
x1_
", %[v_nan_lo], 1
\n
"
\
" v_cndmask_b32 v55, v50, %[v_nan_hi], s[36:37]
\n
"
\
#define _UK_PK_CVT_(x0_, x1_, y_)
\
" v_cmp_u_f32 s[36:37], "
x0_
", "
x0_
"
\n
"
\
" v_add3_u32 v50, "
x0_
", %[v_nan_lo], 1
\n
"
\
" v_cndmask_b32 v54, v50, %[v_nan_hi], s[36:37]
\n
"
\
" v_cmp_u_f32 s[36:37], "
x1_
", "
x1_
"
\n
"
\
" v_add3_u32 v50, "
x1_
", %[v_nan_lo], 1
\n
"
\
" v_cndmask_b32 v55, v50, %[v_nan_hi], s[36:37]
\n
"
\
" v_perm_b32 "
y_
", v55, v54, s52
\n
"
#
define _UK_ATOMIC_ADD_ "global_atomic_pk_add_bf16"
#define _UK_ATOMIC_ADD_ "global_atomic_pk_add_bf16"
#elif CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_FP16
#define _UK_MFMA_ "v_mfma_f32_16x16x16_f16"
#
define _UK_PK_CVT_(x0_, x1_, y_)
\
" v_cvt_f16_f32 v54, "
x0_
"
\n
"
\
" v_cvt_f16_f32 v55, "
x1_
"
\n
"
\
#define _UK_PK_CVT_(x0_, x1_, y_) \
" v_cvt_f16_f32 v54, "
x0_
"
\n
"
\
" v_cvt_f16_f32 v55, "
x1_
"
\n
"
\
" v_pack_b32_f16 "
y_
", v54, v55
\n
"
#
define _UK_ATOMIC_ADD_ "global_atomic_pk_add_f16"
#define _UK_ATOMIC_ADD_ "global_atomic_pk_add_f16"
#endif
";-------------------------------------------------------------
\n
"
" s_mov_b32 s52, 0x07060302 ; v_perm
\n
"
" s_mov_b64 s[38:39], exec ; save current exec
\n
"
" s_mov_b32 s8, %[s_res_o0]
\n
"
" s_mov_b32 s9, %[s_res_o1]
\n
"
" s_mov_b32 s12, %[s_res_b0]
\n
"
" s_mov_b32 s13, %[s_res_b1]
\n
"
" s_mov_b32 s14, %[s_res_b2]
\n
"
" s_mov_b32 s15, %[s_res_b3]
\n
"
" ds_read_b64 v[128:129], %[v_sld_y_os] offset:0 + %[sld_a_base]
\n
"
" ds_read_b64 v[130:131], %[v_sld_y_os] offset:128 + %[sld_a_base]
\n
"
" ds_read_b64 v[132:133], %[v_sld_y_os] offset:1024 + %[sld_a_base]
\n
"
" ds_read_b64 v[134:135], %[v_sld_y_os] offset:1152 + %[sld_a_base]
\n
"
" ds_read_b64 v[136:137], %[v_sld_y_os] offset:2048 + %[sld_a_base]
\n
"
" ds_read_b64 v[138:139], %[v_sld_y_os] offset:2176 + %[sld_a_base]
\n
"
" ds_read_b64 v[140:141], %[v_sld_y_os] offset:3072 + %[sld_a_base]
\n
"
" ds_read_b64 v[142:143], %[v_sld_y_os] offset:3200 + %[sld_a_base]
\n
"
" ds_read_b64 v[144:145], %[v_sld_y_os] offset:4096 + %[sld_a_base]
\n
"
" ds_read_b64 v[146:147], %[v_sld_y_os] offset:4224 + %[sld_a_base]
\n
"
" ds_read_b64 v[148:149], %[v_sld_y_os] offset:5120 + %[sld_a_base]
\n
"
" ds_read_b64 v[150:151], %[v_sld_y_os] offset:5248 + %[sld_a_base]
\n
"
" ds_read_b64 v[152:153], %[v_sld_y_os] offset:6144 + %[sld_a_base]
\n
"
" ds_read_b64 v[154:155], %[v_sld_y_os] offset:6272 + %[sld_a_base]
\n
"
" ds_read_b64 v[156:157], %[v_sld_y_os] offset:7168 + %[sld_a_base]
\n
"
" ds_read_b64 v[158:159], %[v_sld_y_os] offset:7296 + %[sld_a_base]
\n
"
" ds_read_b64 v[160:161], %[v_sld_y_os] offset:8192 + %[sld_a_base]
\n
"
" ds_read_b64 v[162:163], %[v_sld_y_os] offset:8320 + %[sld_a_base]
\n
"
" ds_read_b64 v[164:165], %[v_sld_y_os] offset:9216 + %[sld_a_base]
\n
"
" ds_read_b64 v[166:167], %[v_sld_y_os] offset:9344 + %[sld_a_base]
\n
"
" ds_read_b64 v[168:169], %[v_sld_y_os] offset:10240 + %[sld_a_base]
\n
"
" ds_read_b64 v[170:171], %[v_sld_y_os] offset:10368 + %[sld_a_base]
\n
"
" ds_read_b64 v[172:173], %[v_sld_y_os] offset:11264 + %[sld_a_base]
\n
"
" ds_read_b64 v[174:175], %[v_sld_y_os] offset:11392 + %[sld_a_base]
\n
"
" ds_read_b64 v[176:177], %[v_sld_y_os] offset:12288 + %[sld_a_base]
\n
"
" ds_read_b64 v[178:179], %[v_sld_y_os] offset:12416 + %[sld_a_base]
\n
"
" ds_read_b64 v[180:181], %[v_sld_y_os] offset:13312 + %[sld_a_base]
\n
"
" ds_read_b64 v[182:183], %[v_sld_y_os] offset:13440 + %[sld_a_base]
\n
"
" ds_read_b64 v[184:185], %[v_sld_y_os] offset:14336 + %[sld_a_base]
\n
"
" ds_read_b64 v[186:187], %[v_sld_y_os] offset:14464 + %[sld_a_base]
\n
"
" ds_read_b64 v[188:189], %[v_sld_y_os] offset:15360 + %[sld_a_base]
\n
"
" ds_read_b64 v[190:191], %[v_sld_y_os] offset:15488 + %[sld_a_base]
\n
"
" ds_read_b64 v[192:193], %[v_sld_y_os] offset:16384 + %[sld_a_base]
\n
"
" ds_read_b64 v[194:195], %[v_sld_y_os] offset:16512 + %[sld_a_base]
\n
"
" ds_read_b64 v[196:197], %[v_sld_y_os] offset:17408 + %[sld_a_base]
\n
"
" ds_read_b64 v[198:199], %[v_sld_y_os] offset:17536 + %[sld_a_base]
\n
"
" ds_read_b64 v[200:201], %[v_sld_y_os] offset:18432 + %[sld_a_base]
\n
"
" ds_read_b64 v[202:203], %[v_sld_y_os] offset:18560 + %[sld_a_base]
\n
"
" ds_read_b64 v[204:205], %[v_sld_y_os] offset:19456 + %[sld_a_base]
\n
"
" ds_read_b64 v[206:207], %[v_sld_y_os] offset:19584 + %[sld_a_base]
\n
"
" ds_read_b64 v[208:209], %[v_sld_y_os] offset:20480 + %[sld_a_base]
\n
"
" ds_read_b64 v[210:211], %[v_sld_y_os] offset:20608 + %[sld_a_base]
\n
"
" ds_read_b64 v[212:213], %[v_sld_y_os] offset:21504 + %[sld_a_base]
\n
"
" ds_read_b64 v[214:215], %[v_sld_y_os] offset:21632 + %[sld_a_base]
\n
"
" ds_read_b64 v[216:217], %[v_sld_y_os] offset:22528 + %[sld_a_base]
\n
"
" ds_read_b64 v[218:219], %[v_sld_y_os] offset:22656 + %[sld_a_base]
\n
"
" ds_read_b64 v[220:221], %[v_sld_y_os] offset:23552 + %[sld_a_base]
\n
"
" ds_read_b64 v[222:223], %[v_sld_y_os] offset:23680 + %[sld_a_base]
\n
"
" ds_read_b64 v[224:225], %[v_sld_y_os] offset:24576 + %[sld_a_base]
\n
"
" ds_read_b64 v[226:227], %[v_sld_y_os] offset:24704 + %[sld_a_base]
\n
"
" ds_read_b64 v[228:229], %[v_sld_y_os] offset:25600 + %[sld_a_base]
\n
"
" ds_read_b64 v[230:231], %[v_sld_y_os] offset:25728 + %[sld_a_base]
\n
"
" ds_read_b64 v[232:233], %[v_sld_y_os] offset:26624 + %[sld_a_base]
\n
"
" ds_read_b64 v[234:235], %[v_sld_y_os] offset:26752 + %[sld_a_base]
\n
"
" ds_read_b64 v[236:237], %[v_sld_y_os] offset:27648 + %[sld_a_base]
\n
"
" ds_read_b64 v[238:239], %[v_sld_y_os] offset:27776 + %[sld_a_base]
\n
"
" ds_read_b64 v[240:241], %[v_sld_y_os] offset:28672 + %[sld_a_base]
\n
"
" ds_read_b64 v[242:243], %[v_sld_y_os] offset:28800 + %[sld_a_base]
\n
"
" ds_read_b64 v[244:245], %[v_sld_y_os] offset:29696 + %[sld_a_base]
\n
"
" ds_read_b64 v[246:247], %[v_sld_y_os] offset:29824 + %[sld_a_base]
\n
"
" ds_read_b64 v[248:249], %[v_sld_y_os] offset:30720 + %[sld_a_base]
\n
"
" ds_read_b64 v[250:251], %[v_sld_y_os] offset:30848 + %[sld_a_base]
\n
"
" ds_read_b64 v[252:253], %[v_sld_y_os] offset:31744 + %[sld_a_base]
\n
"
" ds_read_b64 v[254:255], %[v_sld_y_os] offset:31872 + %[sld_a_base]
\n
"
" s_waitcnt 0
\n
"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[12:15], 0 offen offset:3072
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond
\n
"
" s_cselect_b32 s86, %[s_tile_os_b], 0
\n
"
" s_add_u32 s12, s86, s12
\n
"
" s_addc_u32 s13, 0, s13
\n
"
" s_waitcnt 0
\n
"
"L_start%=:
\n
"
" s_waitcnt vmcnt(32)
\n
"
" s_barrier
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[0:1], v[128:129], 0
\n
"
" buffer_load_dwordx4 acc[128:131], %[v_os_b0], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[2:3], v[130:131], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[4:5], v[132:133], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[6:7], v[134:135], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[8:9], v[136:137], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" buffer_load_dwordx4 acc[132:135], %[v_os_b0], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[10:11], v[138:139], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[12:13], v[140:141], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[14:15], v[142:143], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[0:1], v[192:193], 0
\n
"
" buffer_load_dwordx4 acc[136:139], %[v_os_b0], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[2:3], v[194:195], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[4:5], v[196:197], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[6:7], v[198:199], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[8:9], v[200:201], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" buffer_load_dwordx4 acc[140:143], %[v_os_b0], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[10:11], v[202:203], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[12:13], v[204:205], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[14:15], v[206:207], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[16:17], v[128:129], 0
\n
"
" buffer_load_dwordx4 acc[144:147], %[v_os_b1], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[18:19], v[130:131], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[20:21], v[132:133], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[22:23], v[134:135], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[24:25], v[136:137], [%[c8], %[c9], %[c10], %[c11]]
\n
"
" buffer_load_dwordx4 acc[148:151], %[v_os_b1], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[26:27], v[138:139], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[28:29], v[140:141], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[30:31], v[142:143], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[16:17], v[192:193], 0
\n
"
" buffer_load_dwordx4 acc[152:155], %[v_os_b1], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[18:19], v[194:195], [%[c12], %[c13], %[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[20:21], v[196:197], [%[c12], %[c13], %[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[22:23], v[198:199], [%[c12], %[c13], %[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[24:25], v[200:201], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" buffer_load_dwordx4 acc[156:159], %[v_os_b1], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[26:27], v[202:203], [%[c12], %[c13], %[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[28:29], v[204:205], [%[c12], %[c13], %[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[30:31], v[206:207], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[32:33], v[144:145], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" buffer_load_dwordx4 acc[160:163], %[v_os_b2], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[34:35], v[146:147], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[36:37], v[148:149], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[38:39], v[150:151], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[40:41], v[152:153], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" buffer_load_dwordx4 acc[164:167], %[v_os_b2], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[42:43], v[154:155], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[44:45], v[156:157], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[46:47], v[158:159], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[32:33], v[208:209], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" buffer_load_dwordx4 acc[168:171], %[v_os_b2], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[34:35], v[210:211], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[36:37], v[212:213], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[38:39], v[214:215], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[40:41], v[216:217], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" buffer_load_dwordx4 acc[172:175], %[v_os_b2], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[42:43], v[218:219], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[44:45], v[220:221], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[46:47], v[222:223], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[48:49], v[144:145], [%[c8], %[c9], %[c10], %[c11]]
\n
"
" buffer_load_dwordx4 acc[176:179], %[v_os_b3], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[50:51], v[146:147], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[52:53], v[148:149], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[54:55], v[150:151], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[56:57], v[152:153], [%[c8], %[c9], %[c10], %[c11]]
\n
"
" buffer_load_dwordx4 acc[180:183], %[v_os_b3], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[58:59], v[154:155], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[60:61], v[156:157], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[62:63], v[158:159], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[48:49], v[208:209], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" buffer_load_dwordx4 acc[184:187], %[v_os_b3], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[50:51], v[210:211], [%[c12], %[c13], %[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[52:53], v[212:213], [%[c12], %[c13], %[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[54:55], v[214:215], [%[c12], %[c13], %[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[56:57], v[216:217], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" buffer_load_dwordx4 acc[188:191], %[v_os_b3], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[58:59], v[218:219], [%[c12], %[c13], %[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[60:61], v[220:221], [%[c12], %[c13], %[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[62:63], v[222:223], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[64:65], v[160:161], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" buffer_load_dwordx4 acc[192:195], %[v_os_b4], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[66:67], v[162:163], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[68:69], v[164:165], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[70:71], v[166:167], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[72:73], v[168:169], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" buffer_load_dwordx4 acc[196:199], %[v_os_b4], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[74:75], v[170:171], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[76:77], v[172:173], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[78:79], v[174:175], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[64:65], v[224:225], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" buffer_load_dwordx4 acc[200:203], %[v_os_b4], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[66:67], v[226:227], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[68:69], v[228:229], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[70:71], v[230:231], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[72:73], v[232:233], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" buffer_load_dwordx4 acc[204:207], %[v_os_b4], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[74:75], v[234:235], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[76:77], v[236:237], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[78:79], v[238:239], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[80:81], v[160:161], [%[c8], %[c9], %[c10], %[c11]]
\n
"
" buffer_load_dwordx4 acc[208:211], %[v_os_b5], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[82:83], v[162:163], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[84:85], v[164:165], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[86:87], v[166:167], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[88:89], v[168:169], [%[c8], %[c9], %[c10], %[c11]]
\n
"
" buffer_load_dwordx4 acc[212:215], %[v_os_b5], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[90:91], v[170:171], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[92:93], v[172:173], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[94:95], v[174:175], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[80:81], v[224:225], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" buffer_load_dwordx4 acc[216:219], %[v_os_b5], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[82:83], v[226:227], [%[c12], %[c13], %[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[84:85], v[228:229], [%[c12], %[c13], %[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[86:87], v[230:231], [%[c12], %[c13], %[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[88:89], v[232:233], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" buffer_load_dwordx4 acc[220:223], %[v_os_b5], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[90:91], v[234:235], [%[c12], %[c13], %[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[92:93], v[236:237], [%[c12], %[c13], %[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[94:95], v[238:239], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[96:97], v[176:177], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" buffer_load_dwordx4 acc[224:227], %[v_os_b6], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[98:99], v[178:179], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[100:101], v[180:181], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[102:103], v[182:183], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[104:105], v[184:185], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" buffer_load_dwordx4 acc[228:231], %[v_os_b6], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[106:107], v[186:187], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[108:109], v[188:189], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[110:111], v[190:191], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[96:97], v[240:241], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" buffer_load_dwordx4 acc[232:235], %[v_os_b6], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[98:99], v[242:243], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[100:101], v[244:245], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[102:103], v[246:247], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[104:105], v[248:249], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" buffer_load_dwordx4 acc[236:239], %[v_os_b6], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[106:107], v[250:251], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[108:109], v[252:253], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[110:111], v[254:255], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[112:113], v[176:177], [%[c8], %[c9], %[c10], %[c11]]
\n
"
" buffer_load_dwordx4 acc[240:243], %[v_os_b7], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[114:115], v[178:179], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[116:117], v[180:181], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[118:119], v[182:183], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[120:121], v[184:185], [%[c8], %[c9], %[c10], %[c11]]
\n
"
" buffer_load_dwordx4 acc[244:247], %[v_os_b7], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[122:123], v[186:187], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[124:125], v[188:189], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[126:127], v[190:191], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[112:113], v[240:241], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" buffer_load_dwordx4 acc[248:251], %[v_os_b7], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[114:115], v[242:243], [%[c12], %[c13], %[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[116:117], v[244:245], [%[c12], %[c13], %[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[118:119], v[246:247], [%[c12], %[c13], %[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[120:121], v[248:249], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" buffer_load_dwordx4 acc[252:255], %[v_os_b7], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[122:123], v[250:251], [%[c12], %[c13], %[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[124:125], v[252:253], [%[c12], %[c13], %[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[126:127], v[254:255], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" v_mul_f32 %[c0], %[scale_0], %[c0]
\n
"
" v_mul_f32 %[c1], %[scale_0], %[c1]
\n
"
" v_mul_f32 %[c2], %[scale_0], %[c2]
\n
"
" v_mul_f32 %[c3], %[scale_0], %[c3]
\n
"
" v_mul_f32 %[c4], %[scale_1], %[c4]
\n
"
" v_mul_f32 %[c5], %[scale_1], %[c5]
\n
"
" v_mul_f32 %[c6], %[scale_1], %[c6]
\n
"
" v_mul_f32 %[c7], %[scale_1], %[c7]
\n
"
" v_mul_f32 %[c8], %[scale_0], %[c8]
\n
"
" v_mul_f32 %[c9], %[scale_0], %[c9]
\n
"
" v_mul_f32 %[c10], %[scale_0], %[c10]
\n
"
" v_mul_f32 %[c11], %[scale_0], %[c11]
\n
"
" v_mul_f32 %[c12], %[scale_1], %[c12]
\n
"
" v_mul_f32 %[c13], %[scale_1], %[c13]
\n
"
" v_mul_f32 %[c14], %[scale_1], %[c14]
\n
"
" v_mul_f32 %[c15], %[scale_1], %[c15]
\n
"
_UK_PK_CVT_
(
"%[c0]"
,
"%[c1]"
,
"%[c0]"
)
_UK_PK_CVT_
(
"%[c2]"
,
"%[c3]"
,
"%[c1]"
)
_UK_PK_CVT_
(
"%[c4]"
,
"%[c5]"
,
"%[c2]"
)
_UK_PK_CVT_
(
"%[c6]"
,
"%[c7]"
,
"%[c3]"
)
_UK_PK_CVT_
(
"%[c8]"
,
"%[c9]"
,
"%[c4]"
)
_UK_PK_CVT_
(
"%[c10]"
,
"%[c11]"
,
"%[c5]"
)
_UK_PK_CVT_
(
"%[c12]"
,
"%[c13]"
,
"%[c6]"
)
_UK_PK_CVT_
(
"%[c14]"
,
"%[c15]"
,
"%[c7]"
)
" ;------------------------------
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c0],%[c1]] offset:0 + %[shfl_base]
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c2],%[c3]] offset:4352 + %[shfl_base]
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c4],%[c5]] offset:2176 + %[shfl_base]
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c6],%[c7]] offset:6528 + %[shfl_base]
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_barrier
\n
"
" ds_read_b32 %[c0], %[v_sfl_sld] offset:0 + %[shfl_base]
\n
"
" ds_read_b32 %[c1], %[v_sfl_sld] offset:32 + %[shfl_base]
\n
"
" ds_read_b32 %[c2], %[v_sfl_sld] offset:64 + %[shfl_base]
\n
"
" ds_read_b32 %[c3], %[v_sfl_sld] offset:96 + %[shfl_base]
\n
"
" ds_read_b32 %[c4], %[v_sfl_sld] offset:4352 + %[shfl_base]
\n
"
" ds_read_b32 %[c5], %[v_sfl_sld] offset:4384 + %[shfl_base]
\n
"
" ds_read_b32 %[c6], %[v_sfl_sld] offset:4416 + %[shfl_base]
\n
"
" ds_read_b32 %[c7], %[v_sfl_sld] offset:4448 + %[shfl_base]
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_mov_b64 exec, %[s_execflag_0]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o0], %[c0], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_1]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o1], %[c1], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_2]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o2], %[c2], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_3]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o3], %[c3], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_4]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o4], %[c4], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_5]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o5], %[c5], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_6]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o6], %[c6], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_7]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o7], %[c7], s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
" s_sub_i32 %[s_loop_cnt], %[s_loop_cnt], 1 ; k--
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 0
\n
"
" s_cbranch_scc0 L_end%=
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond
\n
"
" s_cselect_b32 s86, %[s_tile_os_b], 0
\n
"
" s_add_u32 s12, s86, s12
\n
"
" s_addc_u32 s13, 0, s13
\n
"
" s_add_u32 s8, %[s_tile_os_o], s8
\n
"
" s_addc_u32 s9, 0, s9
\n
"
" s_waitcnt vmcnt(32)
\n
"
" s_barrier
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[128:129], v[128:129], 0
\n
"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[130:131], v[130:131], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[132:133], v[132:133], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[134:135], v[134:135], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[136:137], v[136:137], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[138:139], v[138:139], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[140:141], v[140:141], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[142:143], v[142:143], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[128:129], v[192:193], 0
\n
"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[130:131], v[194:195], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[132:133], v[196:197], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[134:135], v[198:199], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[136:137], v[200:201], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[138:139], v[202:203], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[140:141], v[204:205], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[142:143], v[206:207], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[144:145], v[128:129], 0
\n
"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[146:147], v[130:131], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[148:149], v[132:133], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[150:151], v[134:135], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[152:153], v[136:137], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[154:155], v[138:139], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[156:157], v[140:141], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[158:159], v[142:143], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[144:145], v[192:193], 0
\n
"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[146:147], v[194:195], [%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[148:149], v[196:197], [%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[150:151], v[198:199], [%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[152:153], v[200:201], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[154:155], v[202:203], [%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[156:157], v[204:205], [%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[158:159], v[206:207], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[160:161], v[144:145], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[162:163], v[146:147], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[164:165], v[148:149], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[166:167], v[150:151], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[168:169], v[152:153], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[170:171], v[154:155], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[172:173], v[156:157], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[174:175], v[158:159], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[160:161], v[208:209], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[162:163], v[210:211], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[164:165], v[212:213], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[166:167], v[214:215], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[168:169], v[216:217], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[170:171], v[218:219], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[172:173], v[220:221], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[174:175], v[222:223], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[176:177], v[144:145], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[178:179], v[146:147], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[180:181], v[148:149], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[182:183], v[150:151], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[184:185], v[152:153], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[186:187], v[154:155], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[188:189], v[156:157], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[190:191], v[158:159], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[176:177], v[208:209], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[178:179], v[210:211], [%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[180:181], v[212:213], [%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[182:183], v[214:215], [%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[184:185], v[216:217], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[186:187], v[218:219], [%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[188:189], v[220:221], [%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[190:191], v[222:223], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[192:193], v[160:161], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[194:195], v[162:163], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[196:197], v[164:165], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[198:199], v[166:167], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[200:201], v[168:169], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[202:203], v[170:171], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[204:205], v[172:173], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[206:207], v[174:175], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[192:193], v[224:225], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[194:195], v[226:227], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[196:197], v[228:229], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[198:199], v[230:231], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[200:201], v[232:233], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[202:203], v[234:235], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[204:205], v[236:237], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[206:207], v[238:239], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[208:209], v[160:161], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[210:211], v[162:163], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[212:213], v[164:165], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[214:215], v[166:167], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[216:217], v[168:169], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[218:219], v[170:171], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[220:221], v[172:173], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[222:223], v[174:175], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[208:209], v[224:225], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[210:211], v[226:227], [%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[212:213], v[228:229], [%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[214:215], v[230:231], [%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[216:217], v[232:233], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[218:219], v[234:235], [%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[220:221], v[236:237], [%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[222:223], v[238:239], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[224:225], v[176:177], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[226:227], v[178:179], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[228:229], v[180:181], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[230:231], v[182:183], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[232:233], v[184:185], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[234:235], v[186:187], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[236:237], v[188:189], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[238:239], v[190:191], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[224:225], v[240:241], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[226:227], v[242:243], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[228:229], v[244:245], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[230:231], v[246:247], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[232:233], v[248:249], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[234:235], v[250:251], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[236:237], v[252:253], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[238:239], v[254:255], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[240:241], v[176:177], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[242:243], v[178:179], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[244:245], v[180:181], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[246:247], v[182:183], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[248:249], v[184:185], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[250:251], v[186:187], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[252:253], v[188:189], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[254:255], v[190:191], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[240:241], v[240:241], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[242:243], v[242:243], [%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[244:245], v[244:245], [%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[246:247], v[246:247], [%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[248:249], v[248:249], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[250:251], v[250:251], [%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[252:253], v[252:253], [%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[254:255], v[254:255], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" v_mul_f32 %[c16], %[scale_0], %[c16]
\n
"
" v_mul_f32 %[c17], %[scale_0], %[c17]
\n
"
" v_mul_f32 %[c18], %[scale_0], %[c18]
\n
"
" v_mul_f32 %[c19], %[scale_0], %[c19]
\n
"
" v_mul_f32 %[c20], %[scale_1], %[c20]
\n
"
" v_mul_f32 %[c21], %[scale_1], %[c21]
\n
"
" v_mul_f32 %[c22], %[scale_1], %[c22]
\n
"
" v_mul_f32 %[c23], %[scale_1], %[c23]
\n
"
" v_mul_f32 %[c24], %[scale_0], %[c24]
\n
"
" v_mul_f32 %[c25], %[scale_0], %[c25]
\n
"
" v_mul_f32 %[c26], %[scale_0], %[c26]
\n
"
" v_mul_f32 %[c27], %[scale_0], %[c27]
\n
"
" v_mul_f32 %[c28], %[scale_1], %[c28]
\n
"
" v_mul_f32 %[c29], %[scale_1], %[c29]
\n
"
" v_mul_f32 %[c30], %[scale_1], %[c30]
\n
"
" v_mul_f32 %[c31], %[scale_1], %[c31]
\n
"
" s_mov_b32 s52, 0x07060302 ; v_perm
\n
"
" s_mov_b64 s[38:39], exec ; save current exec
\n
"
" s_mov_b32 s8, %[s_res_o0]
\n
"
" s_mov_b32 s9, %[s_res_o1]
\n
"
" s_mov_b32 s12, %[s_res_b0]
\n
"
" s_mov_b32 s13, %[s_res_b1]
\n
"
" s_mov_b32 s14, %[s_res_b2]
\n
"
" s_mov_b32 s15, %[s_res_b3]
\n
"
" ds_read_b64 v[128:129], %[v_sld_y_os] offset:0 + %[sld_a_base]
\n
"
" ds_read_b64 v[130:131], %[v_sld_y_os] offset:128 + %[sld_a_base]
\n
"
" ds_read_b64 v[132:133], %[v_sld_y_os] offset:1024 + %[sld_a_base]
\n
"
" ds_read_b64 v[134:135], %[v_sld_y_os] offset:1152 + %[sld_a_base]
\n
"
" ds_read_b64 v[136:137], %[v_sld_y_os] offset:2048 + %[sld_a_base]
\n
"
" ds_read_b64 v[138:139], %[v_sld_y_os] offset:2176 + %[sld_a_base]
\n
"
" ds_read_b64 v[140:141], %[v_sld_y_os] offset:3072 + %[sld_a_base]
\n
"
" ds_read_b64 v[142:143], %[v_sld_y_os] offset:3200 + %[sld_a_base]
\n
"
" ds_read_b64 v[144:145], %[v_sld_y_os] offset:4096 + %[sld_a_base]
\n
"
" ds_read_b64 v[146:147], %[v_sld_y_os] offset:4224 + %[sld_a_base]
\n
"
" ds_read_b64 v[148:149], %[v_sld_y_os] offset:5120 + %[sld_a_base]
\n
"
" ds_read_b64 v[150:151], %[v_sld_y_os] offset:5248 + %[sld_a_base]
\n
"
" ds_read_b64 v[152:153], %[v_sld_y_os] offset:6144 + %[sld_a_base]
\n
"
" ds_read_b64 v[154:155], %[v_sld_y_os] offset:6272 + %[sld_a_base]
\n
"
" ds_read_b64 v[156:157], %[v_sld_y_os] offset:7168 + %[sld_a_base]
\n
"
" ds_read_b64 v[158:159], %[v_sld_y_os] offset:7296 + %[sld_a_base]
\n
"
" ds_read_b64 v[160:161], %[v_sld_y_os] offset:8192 + %[sld_a_base]
\n
"
" ds_read_b64 v[162:163], %[v_sld_y_os] offset:8320 + %[sld_a_base]
\n
"
" ds_read_b64 v[164:165], %[v_sld_y_os] offset:9216 + %[sld_a_base]
\n
"
" ds_read_b64 v[166:167], %[v_sld_y_os] offset:9344 + %[sld_a_base]
\n
"
" ds_read_b64 v[168:169], %[v_sld_y_os] offset:10240 + %[sld_a_base]
\n
"
" ds_read_b64 v[170:171], %[v_sld_y_os] offset:10368 + %[sld_a_base]
\n
"
" ds_read_b64 v[172:173], %[v_sld_y_os] offset:11264 + %[sld_a_base]
\n
"
" ds_read_b64 v[174:175], %[v_sld_y_os] offset:11392 + %[sld_a_base]
\n
"
" ds_read_b64 v[176:177], %[v_sld_y_os] offset:12288 + %[sld_a_base]
\n
"
" ds_read_b64 v[178:179], %[v_sld_y_os] offset:12416 + %[sld_a_base]
\n
"
" ds_read_b64 v[180:181], %[v_sld_y_os] offset:13312 + %[sld_a_base]
\n
"
" ds_read_b64 v[182:183], %[v_sld_y_os] offset:13440 + %[sld_a_base]
\n
"
" ds_read_b64 v[184:185], %[v_sld_y_os] offset:14336 + %[sld_a_base]
\n
"
" ds_read_b64 v[186:187], %[v_sld_y_os] offset:14464 + %[sld_a_base]
\n
"
" ds_read_b64 v[188:189], %[v_sld_y_os] offset:15360 + %[sld_a_base]
\n
"
" ds_read_b64 v[190:191], %[v_sld_y_os] offset:15488 + %[sld_a_base]
\n
"
" ds_read_b64 v[192:193], %[v_sld_y_os] offset:16384 + %[sld_a_base]
\n
"
" ds_read_b64 v[194:195], %[v_sld_y_os] offset:16512 + %[sld_a_base]
\n
"
" ds_read_b64 v[196:197], %[v_sld_y_os] offset:17408 + %[sld_a_base]
\n
"
" ds_read_b64 v[198:199], %[v_sld_y_os] offset:17536 + %[sld_a_base]
\n
"
" ds_read_b64 v[200:201], %[v_sld_y_os] offset:18432 + %[sld_a_base]
\n
"
" ds_read_b64 v[202:203], %[v_sld_y_os] offset:18560 + %[sld_a_base]
\n
"
" ds_read_b64 v[204:205], %[v_sld_y_os] offset:19456 + %[sld_a_base]
\n
"
" ds_read_b64 v[206:207], %[v_sld_y_os] offset:19584 + %[sld_a_base]
\n
"
" ds_read_b64 v[208:209], %[v_sld_y_os] offset:20480 + %[sld_a_base]
\n
"
" ds_read_b64 v[210:211], %[v_sld_y_os] offset:20608 + %[sld_a_base]
\n
"
" ds_read_b64 v[212:213], %[v_sld_y_os] offset:21504 + %[sld_a_base]
\n
"
" ds_read_b64 v[214:215], %[v_sld_y_os] offset:21632 + %[sld_a_base]
\n
"
" ds_read_b64 v[216:217], %[v_sld_y_os] offset:22528 + %[sld_a_base]
\n
"
" ds_read_b64 v[218:219], %[v_sld_y_os] offset:22656 + %[sld_a_base]
\n
"
" ds_read_b64 v[220:221], %[v_sld_y_os] offset:23552 + %[sld_a_base]
\n
"
" ds_read_b64 v[222:223], %[v_sld_y_os] offset:23680 + %[sld_a_base]
\n
"
" ds_read_b64 v[224:225], %[v_sld_y_os] offset:24576 + %[sld_a_base]
\n
"
" ds_read_b64 v[226:227], %[v_sld_y_os] offset:24704 + %[sld_a_base]
\n
"
" ds_read_b64 v[228:229], %[v_sld_y_os] offset:25600 + %[sld_a_base]
\n
"
" ds_read_b64 v[230:231], %[v_sld_y_os] offset:25728 + %[sld_a_base]
\n
"
" ds_read_b64 v[232:233], %[v_sld_y_os] offset:26624 + %[sld_a_base]
\n
"
" ds_read_b64 v[234:235], %[v_sld_y_os] offset:26752 + %[sld_a_base]
\n
"
" ds_read_b64 v[236:237], %[v_sld_y_os] offset:27648 + %[sld_a_base]
\n
"
" ds_read_b64 v[238:239], %[v_sld_y_os] offset:27776 + %[sld_a_base]
\n
"
" ds_read_b64 v[240:241], %[v_sld_y_os] offset:28672 + %[sld_a_base]
\n
"
" ds_read_b64 v[242:243], %[v_sld_y_os] offset:28800 + %[sld_a_base]
\n
"
" ds_read_b64 v[244:245], %[v_sld_y_os] offset:29696 + %[sld_a_base]
\n
"
" ds_read_b64 v[246:247], %[v_sld_y_os] offset:29824 + %[sld_a_base]
\n
"
" ds_read_b64 v[248:249], %[v_sld_y_os] offset:30720 + %[sld_a_base]
\n
"
" ds_read_b64 v[250:251], %[v_sld_y_os] offset:30848 + %[sld_a_base]
\n
"
" ds_read_b64 v[252:253], %[v_sld_y_os] offset:31744 + %[sld_a_base]
\n
"
" ds_read_b64 v[254:255], %[v_sld_y_os] offset:31872 + %[sld_a_base]
\n
"
" s_waitcnt 0
\n
"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[12:15], 0 offen offset:3072
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond
\n
"
" s_cselect_b32 s86, %[s_tile_os_b], 0
\n
"
" s_add_u32 s12, s86, s12
\n
"
" s_addc_u32 s13, 0, s13
\n
"
" s_waitcnt 0
\n
"
"L_start%=:
\n
"
" s_waitcnt vmcnt(32)
\n
"
" s_barrier
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[0:1], v[128:129], 0
\n
"
" buffer_load_dwordx4 acc[128:131], %[v_os_b0], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[2:3], v[130:131], [%[c0], %[c1], %[c2], %[c3]] "
"
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[4:5], v[132:133], [%[c0], %[c1], %[c2], "
"%[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[6:7], v[134:135], "
"[%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[8:9], v[136:137], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" buffer_load_dwordx4 acc[132:135], %[v_os_b0], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[10:11], v[138:139], [%[c0], %[c1], %[c2], %[c3]] "
"
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[12:13], v[140:141], [%[c0], %[c1], %[c2], "
"%[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[14:15], v[142:143], "
"[%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[0:1], v[192:193], 0
\n
"
" buffer_load_dwordx4 acc[136:139], %[v_os_b0], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[2:3], v[194:195], [%[c4], %[c5], %[c6], %[c7]] "
"
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[4:5], v[196:197], [%[c4], %[c5], %[c6], "
"%[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[6:7], v[198:199], "
"[%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[8:9], v[200:201], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" buffer_load_dwordx4 acc[140:143], %[v_os_b0], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[10:11], v[202:203], [%[c4], %[c5], %[c6], %[c7]] "
"
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[12:13], v[204:205], [%[c4], %[c5], %[c6], "
"%[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[14:15], v[206:207], "
"[%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[16:17], v[128:129], 0
\n
"
" buffer_load_dwordx4 acc[144:147], %[v_os_b1], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[18:19], v[130:131], [%[c8], %[c9], %[c10], %[c11]] "
"
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[20:21], v[132:133], [%[c8], %[c9], "
"%[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[22:23], v[134:135], [%[c8], %[c9], %[c10], %[c11]] "
"
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[24:25], v[136:137], [%[c8], %[c9], %[c10], %[c11]]
\n
"
" buffer_load_dwordx4 acc[148:151], %[v_os_b1], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[26:27], v[138:139], [%[c8], %[c9], %[c10], %[c11]] "
"
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[28:29], v[140:141], [%[c8], %[c9], "
"%[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[30:31], v[142:143], [%[c8], %[c9], %[c10], %[c11]] "
"
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[16:17], v[192:193], 0
\n
"
" buffer_load_dwordx4 acc[152:155], %[v_os_b1], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[18:19], v[194:195], [%[c12], %[c13], %[c14], %[c15]] "
"
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[20:21], v[196:197], [%[c12], %[c13], "
"%[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[22:23], v[198:199], [%[c12], %[c13], %[c14], %[c15]] "
"
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[24:25], v[200:201], [%[c12], %[c13], "
"%[c14], %[c15]]
\n
"
" buffer_load_dwordx4 acc[156:159], %[v_os_b1], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[26:27], v[202:203], [%[c12], %[c13], %[c14], %[c15]] "
"
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[28:29], v[204:205], [%[c12], %[c13], "
"%[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[30:31], "
"v[206:207], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[32:33], v[144:145], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" buffer_load_dwordx4 acc[160:163], %[v_os_b2], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[34:35], v[146:147], [%[c0], %[c1], %[c2], %[c3]] "
"
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[36:37], v[148:149], [%[c0], %[c1], %[c2], "
"%[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[38:39], v[150:151], "
"[%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[40:41], v[152:153], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" buffer_load_dwordx4 acc[164:167], %[v_os_b2], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[42:43], v[154:155], [%[c0], %[c1], %[c2], %[c3]] "
"
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[44:45], v[156:157], [%[c0], %[c1], %[c2], "
"%[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[46:47], v[158:159], "
"[%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[32:33], v[208:209], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" buffer_load_dwordx4 acc[168:171], %[v_os_b2], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[34:35], v[210:211], [%[c4], %[c5], %[c6], %[c7]] "
"
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[36:37], v[212:213], [%[c4], %[c5], %[c6], "
"%[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[38:39], v[214:215], "
"[%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[40:41], v[216:217], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" buffer_load_dwordx4 acc[172:175], %[v_os_b2], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[42:43], v[218:219], [%[c4], %[c5], %[c6], %[c7]] "
"
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[44:45], v[220:221], [%[c4], %[c5], %[c6], "
"%[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[46:47], v[222:223], "
"[%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[48:49], v[144:145], [%[c8], %[c9], %[c10], %[c11]]
\n
"
" buffer_load_dwordx4 acc[176:179], %[v_os_b3], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[50:51], v[146:147], [%[c8], %[c9], %[c10], %[c11]] "
"
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[52:53], v[148:149], [%[c8], %[c9], "
"%[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[54:55], v[150:151], [%[c8], %[c9], %[c10], %[c11]] "
"
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[56:57], v[152:153], [%[c8], %[c9], %[c10], %[c11]]
\n
"
" buffer_load_dwordx4 acc[180:183], %[v_os_b3], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[58:59], v[154:155], [%[c8], %[c9], %[c10], %[c11]] "
"
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[60:61], v[156:157], [%[c8], %[c9], "
"%[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[62:63], v[158:159], [%[c8], %[c9], %[c10], %[c11]] "
"
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[48:49], v[208:209], [%[c12], %[c13], "
"%[c14], %[c15]]
\n
"
" buffer_load_dwordx4 acc[184:187], %[v_os_b3], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[50:51], v[210:211], [%[c12], %[c13], %[c14], %[c15]] "
"
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[52:53], v[212:213], [%[c12], %[c13], "
"%[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[54:55], v[214:215], [%[c12], %[c13], %[c14], %[c15]] "
"
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[56:57], v[216:217], [%[c12], %[c13], "
"%[c14], %[c15]]
\n
"
" buffer_load_dwordx4 acc[188:191], %[v_os_b3], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[58:59], v[218:219], [%[c12], %[c13], %[c14], %[c15]] "
"
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[60:61], v[220:221], [%[c12], %[c13], "
"%[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[62:63], "
"v[222:223], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[64:65], v[160:161], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" buffer_load_dwordx4 acc[192:195], %[v_os_b4], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[66:67], v[162:163], [%[c0], %[c1], %[c2], %[c3]] "
"
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[68:69], v[164:165], [%[c0], %[c1], %[c2], "
"%[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[70:71], v[166:167], "
"[%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[72:73], v[168:169], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" buffer_load_dwordx4 acc[196:199], %[v_os_b4], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[74:75], v[170:171], [%[c0], %[c1], %[c2], %[c3]] "
"
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[76:77], v[172:173], [%[c0], %[c1], %[c2], "
"%[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[78:79], v[174:175], "
"[%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[64:65], v[224:225], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" buffer_load_dwordx4 acc[200:203], %[v_os_b4], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[66:67], v[226:227], [%[c4], %[c5], %[c6], %[c7]] "
"
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[68:69], v[228:229], [%[c4], %[c5], %[c6], "
"%[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[70:71], v[230:231], "
"[%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[72:73], v[232:233], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" buffer_load_dwordx4 acc[204:207], %[v_os_b4], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[74:75], v[234:235], [%[c4], %[c5], %[c6], %[c7]] "
"
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[76:77], v[236:237], [%[c4], %[c5], %[c6], "
"%[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[78:79], v[238:239], "
"[%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[80:81], v[160:161], [%[c8], %[c9], %[c10], %[c11]]
\n
"
" buffer_load_dwordx4 acc[208:211], %[v_os_b5], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[82:83], v[162:163], [%[c8], %[c9], %[c10], %[c11]] "
"
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[84:85], v[164:165], [%[c8], %[c9], "
"%[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[86:87], v[166:167], [%[c8], %[c9], %[c10], %[c11]] "
"
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[88:89], v[168:169], [%[c8], %[c9], %[c10], %[c11]]
\n
"
" buffer_load_dwordx4 acc[212:215], %[v_os_b5], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[90:91], v[170:171], [%[c8], %[c9], %[c10], %[c11]] "
"
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[92:93], v[172:173], [%[c8], %[c9], "
"%[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[94:95], v[174:175], [%[c8], %[c9], %[c10], %[c11]] "
"
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[80:81], v[224:225], [%[c12], %[c13], "
"%[c14], %[c15]]
\n
"
" buffer_load_dwordx4 acc[216:219], %[v_os_b5], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[82:83], v[226:227], [%[c12], %[c13], %[c14], %[c15]] "
"
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[84:85], v[228:229], [%[c12], %[c13], "
"%[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[86:87], v[230:231], [%[c12], %[c13], %[c14], %[c15]] "
"
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[88:89], v[232:233], [%[c12], %[c13], "
"%[c14], %[c15]]
\n
"
" buffer_load_dwordx4 acc[220:223], %[v_os_b5], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[90:91], v[234:235], [%[c12], %[c13], %[c14], %[c15]] "
"
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[92:93], v[236:237], [%[c12], %[c13], "
"%[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[94:95], "
"v[238:239], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[96:97], v[176:177], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" buffer_load_dwordx4 acc[224:227], %[v_os_b6], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[98:99], v[178:179], [%[c0], %[c1], %[c2], %[c3]] "
"
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[100:101], v[180:181], [%[c0], %[c1], "
"%[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[102:103], v[182:183], [%[c0], %[c1], %[c2], %[c3]] "
"
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[104:105], v[184:185], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" buffer_load_dwordx4 acc[228:231], %[v_os_b6], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[106:107], v[186:187], [%[c0], %[c1], %[c2], %[c3]] "
"
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[108:109], v[188:189], [%[c0], %[c1], "
"%[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[110:111], v[190:191], [%[c0], %[c1], %[c2], %[c3]] "
"
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[96:97], v[240:241], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" buffer_load_dwordx4 acc[232:235], %[v_os_b6], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[98:99], v[242:243], [%[c4], %[c5], %[c6], %[c7]] "
"
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[100:101], v[244:245], [%[c4], %[c5], "
"%[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[102:103], v[246:247], [%[c4], %[c5], %[c6], %[c7]] "
"
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[104:105], v[248:249], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" buffer_load_dwordx4 acc[236:239], %[v_os_b6], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[106:107], v[250:251], [%[c4], %[c5], %[c6], %[c7]] "
"
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[108:109], v[252:253], [%[c4], %[c5], "
"%[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[110:111], v[254:255], [%[c4], %[c5], %[c6], %[c7]] "
"
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[112:113], v[176:177], [%[c8], %[c9], %[c10], %[c11]]
\n
"
" buffer_load_dwordx4 acc[240:243], %[v_os_b7], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[114:115], v[178:179], [%[c8], %[c9], %[c10], %[c11]] "
"
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[116:117], v[180:181], [%[c8], %[c9], "
"%[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[118:119], v[182:183], [%[c8], %[c9], %[c10], %[c11]] "
"
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[120:121], v[184:185], [%[c8], %[c9], %[c10], %[c11]]
\n
"
" buffer_load_dwordx4 acc[244:247], %[v_os_b7], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[122:123], v[186:187], [%[c8], %[c9], %[c10], %[c11]] "
"
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[124:125], v[188:189], [%[c8], %[c9], "
"%[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[126:127], v[190:191], [%[c8], %[c9], %[c10], %[c11]] "
"
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[112:113], v[240:241], [%[c12], %[c13], "
"%[c14], %[c15]]
\n
"
" buffer_load_dwordx4 acc[248:251], %[v_os_b7], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[114:115], v[242:243], [%[c12], %[c13], %[c14], "
"%[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[116:117], v[244:245], [%[c12], "
"%[c13], %[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[118:119], v[246:247], [%[c12], %[c13], %[c14], "
"%[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[120:121], v[248:249], [%[c12], "
"%[c13], %[c14], %[c15]]
\n
"
" buffer_load_dwordx4 acc[252:255], %[v_os_b7], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[122:123], v[250:251], [%[c12], %[c13], %[c14], "
"%[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[124:125], v[252:253], [%[c12], "
"%[c13], %[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[126:127], v[254:255], [%[c12], %[c13], %[c14], "
"%[c15]]
\n
"
" v_mul_f32 %[c0], %[scale_0], %[c0]
\n
"
" v_mul_f32 %[c1], %[scale_0], %[c1]
\n
"
" v_mul_f32 %[c2], %[scale_0], %[c2]
\n
"
" v_mul_f32 %[c3], %[scale_0], %[c3]
\n
"
" v_mul_f32 %[c4], %[scale_1], %[c4]
\n
"
" v_mul_f32 %[c5], %[scale_1], %[c5]
\n
"
" v_mul_f32 %[c6], %[scale_1], %[c6]
\n
"
" v_mul_f32 %[c7], %[scale_1], %[c7]
\n
"
" v_mul_f32 %[c8], %[scale_0], %[c8]
\n
"
" v_mul_f32 %[c9], %[scale_0], %[c9]
\n
"
" v_mul_f32 %[c10], %[scale_0], %[c10]
\n
"
" v_mul_f32 %[c11], %[scale_0], %[c11]
\n
"
" v_mul_f32 %[c12], %[scale_1], %[c12]
\n
"
" v_mul_f32 %[c13], %[scale_1], %[c13]
\n
"
" v_mul_f32 %[c14], %[scale_1], %[c14]
\n
"
" v_mul_f32 %[c15], %[scale_1], %[c15]
\n
"
_UK_PK_CVT_
(
"%[c0]"
,
"%[c1]"
,
"%[c0]"
)
_UK_PK_CVT_
(
"%[c2]"
,
"%[c3]"
,
"%[c1]"
)
_UK_PK_CVT_
(
"%[c4]"
,
"%[c5]"
,
"%[c2]"
)
_UK_PK_CVT_
(
"%[c6]"
,
"%[c7]"
,
"%[c3]"
)
_UK_PK_CVT_
(
"%[c8]"
,
"%[c9]"
,
"%[c4]"
)
_UK_PK_CVT_
(
"%[c10]"
,
"%[c11]"
,
"%[c5]"
)
_UK_PK_CVT_
(
"%[c12]"
,
"%[c13]"
,
"%[c6]"
)
_UK_PK_CVT_
(
"%[c14]"
,
"%[c15]"
,
"%[c7]"
)
" ;------------------------------
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c0],%[c1]] offset:0 + %[shfl_base] "
"
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c2],%[c3]] offset:4352 + %[shfl_base] "
"
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c4],%[c5]] offset:2176 + %[shfl_base] "
"
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c6],%[c7]] offset:6528 + %[shfl_base] "
"
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_barrier
\n
"
" ds_read_b32 %[c0], %[v_sfl_sld] offset:0 + %[shfl_base] "
"
\n
"
" ds_read_b32 %[c1], %[v_sfl_sld] offset:32 + %[shfl_base] "
"
\n
"
" ds_read_b32 %[c2], %[v_sfl_sld] offset:64 + %[shfl_base] "
"
\n
"
" ds_read_b32 %[c3], %[v_sfl_sld] offset:96 + %[shfl_base] "
"
\n
"
" ds_read_b32 %[c4], %[v_sfl_sld] offset:4352 + %[shfl_base] "
"
\n
"
" ds_read_b32 %[c5], %[v_sfl_sld] offset:4384 + %[shfl_base] "
"
\n
"
" ds_read_b32 %[c6], %[v_sfl_sld] offset:4416 + %[shfl_base] "
"
\n
"
" ds_read_b32 %[c7], %[v_sfl_sld] offset:4448 + %[shfl_base] "
"
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_mov_b64 exec, %[s_execflag_0] "
"
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o0], %[c0], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_1] "
"
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o1], %[c1], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_2] "
"
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o2], %[c2], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_3] "
"
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o3], %[c3], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_4] "
"
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o4], %[c4], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_5] "
"
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o5], %[c5], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_6] "
"
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o6], %[c6], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_7] "
"
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o7], %[c7], s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
" s_sub_i32 %[s_loop_cnt], %[s_loop_cnt], 1 ; k--
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 0
\n
"
" s_cbranch_scc0 L_end%=
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond
\n
"
" s_cselect_b32 s86, %[s_tile_os_b], 0
\n
"
" s_add_u32 s12, s86, s12
\n
"
" s_addc_u32 s13, 0, s13
\n
"
" s_add_u32 s8, %[s_tile_os_o], s8
\n
"
" s_addc_u32 s9, 0, s9
\n
"
" s_waitcnt vmcnt(32)
\n
"
" s_barrier
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[128:129], v[128:129], 0
\n
"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen "
"
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[130:131], "
"v[130:131], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[132:133], v[132:133], "
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[134:135], v[134:135], "
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[136:137], v[136:137], "
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen "
"offset:1024
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[138:139], v[138:139], "
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[140:141], v[140:141], "
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[142:143], v[142:143], "
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[128:129], v[192:193], 0
\n
"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen "
"offset:2048
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[130:131], v[194:195], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[132:133], v[196:197], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[134:135], v[198:199], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[136:137], v[200:201], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen "
"offset:3072
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[138:139], v[202:203], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[140:141], v[204:205], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[142:143], v[206:207], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[144:145], v[128:129], 0
\n
"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen "
"
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[146:147], "
"v[130:131], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[148:149], v[132:133], "
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[150:151], v[134:135], "
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[152:153], v[136:137], "
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen "
"offset:1024
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[154:155], v[138:139], "
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[156:157], v[140:141], "
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[158:159], v[142:143], "
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[144:145], v[192:193], 0
\n
"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen "
"offset:2048
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[146:147], v[194:195], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[148:149], v[196:197], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[150:151], v[198:199], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[152:153], v[200:201], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen "
"offset:3072
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[154:155], v[202:203], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[156:157], v[204:205], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[158:159], v[206:207], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[160:161], v[144:145], "
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen "
"
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[162:163], "
"v[146:147], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[164:165], v[148:149], "
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[166:167], v[150:151], "
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[168:169], v[152:153], "
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen "
"offset:1024
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[170:171], v[154:155], "
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[172:173], v[156:157], "
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[174:175], v[158:159], "
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[160:161], v[208:209], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen "
"offset:2048
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[162:163], v[210:211], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[164:165], v[212:213], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[166:167], v[214:215], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[168:169], v[216:217], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen "
"offset:3072
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[170:171], v[218:219], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[172:173], v[220:221], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[174:175], v[222:223], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[176:177], v[144:145], "
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen "
"
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[178:179], "
"v[146:147], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[180:181], v[148:149], "
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[182:183], v[150:151], "
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[184:185], v[152:153], "
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen "
"offset:1024
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[186:187], v[154:155], "
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[188:189], v[156:157], "
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[190:191], v[158:159], "
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[176:177], v[208:209], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen "
"offset:2048
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[178:179], v[210:211], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[180:181], v[212:213], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[182:183], v[214:215], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[184:185], v[216:217], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen "
"offset:3072
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[186:187], v[218:219], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[188:189], v[220:221], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[190:191], v[222:223], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[192:193], v[160:161], "
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
" buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[12:15], 0 offen "
"
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[194:195], "
"v[162:163], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[196:197], v[164:165], "
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[198:199], v[166:167], "
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[200:201], v[168:169], "
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
" buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[12:15], 0 offen "
"offset:1024
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[202:203], v[170:171], "
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[204:205], v[172:173], "
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[206:207], v[174:175], "
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[192:193], v[224:225], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
" buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[12:15], 0 offen "
"offset:2048
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[194:195], v[226:227], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[196:197], v[228:229], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[198:199], v[230:231], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[200:201], v[232:233], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
" buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[12:15], 0 offen "
"offset:3072
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[202:203], v[234:235], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[204:205], v[236:237], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[206:207], v[238:239], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[208:209], v[160:161], "
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
" buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[12:15], 0 offen "
"
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[210:211], "
"v[162:163], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[212:213], v[164:165], "
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[214:215], v[166:167], "
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[216:217], v[168:169], "
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
" buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[12:15], 0 offen "
"offset:1024
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[218:219], v[170:171], "
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[220:221], v[172:173], "
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[222:223], v[174:175], "
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[208:209], v[224:225], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
" buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[12:15], 0 offen "
"offset:2048
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[210:211], v[226:227], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[212:213], v[228:229], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[214:215], v[230:231], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[216:217], v[232:233], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
" buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[12:15], 0 offen "
"offset:3072
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[218:219], v[234:235], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[220:221], v[236:237], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[222:223], v[238:239], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[224:225], v[176:177], "
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
" buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[12:15], 0 offen "
"
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[226:227], "
"v[178:179], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[228:229], v[180:181], "
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[230:231], v[182:183], "
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[232:233], v[184:185], "
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
" buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[12:15], 0 offen "
"offset:1024
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[234:235], v[186:187], "
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[236:237], v[188:189], "
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[238:239], v[190:191], "
"[%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[224:225], v[240:241], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
" buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[12:15], 0 offen "
"offset:2048
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[226:227], v[242:243], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[228:229], v[244:245], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[230:231], v[246:247], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[232:233], v[248:249], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
" buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[12:15], 0 offen "
"offset:3072
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[234:235], v[250:251], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[236:237], v[252:253], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[238:239], v[254:255], "
"[%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[240:241], v[176:177], "
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
" buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[12:15], 0 offen "
"
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[242:243], "
"v[178:179], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[244:245], v[180:181], "
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[246:247], v[182:183], "
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[248:249], v[184:185], "
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
" buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[12:15], 0 offen "
"offset:1024
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[250:251], v[186:187], "
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[252:253], v[188:189], "
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[254:255], v[190:191], "
"[%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[240:241], v[240:241], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
" buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[12:15], 0 offen "
"offset:2048
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[242:243], v[242:243], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[244:245], v[244:245], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[246:247], v[246:247], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[248:249], v[248:249], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
" buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[12:15], 0 offen "
"offset:3072
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[250:251], v[250:251], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[252:253], v[252:253], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[254:255], v[254:255], "
"[%[c28],%[c29],%[c30],%[c31]]
\n
"
" v_mul_f32 %[c16], %[scale_0], %[c16]
\n
"
" v_mul_f32 %[c17], %[scale_0], %[c17]
\n
"
" v_mul_f32 %[c18], %[scale_0], %[c18]
\n
"
" v_mul_f32 %[c19], %[scale_0], %[c19]
\n
"
" v_mul_f32 %[c20], %[scale_1], %[c20]
\n
"
" v_mul_f32 %[c21], %[scale_1], %[c21]
\n
"
" v_mul_f32 %[c22], %[scale_1], %[c22]
\n
"
" v_mul_f32 %[c23], %[scale_1], %[c23]
\n
"
" v_mul_f32 %[c24], %[scale_0], %[c24]
\n
"
" v_mul_f32 %[c25], %[scale_0], %[c25]
\n
"
" v_mul_f32 %[c26], %[scale_0], %[c26]
\n
"
" v_mul_f32 %[c27], %[scale_0], %[c27]
\n
"
" v_mul_f32 %[c28], %[scale_1], %[c28]
\n
"
" v_mul_f32 %[c29], %[scale_1], %[c29]
\n
"
" v_mul_f32 %[c30], %[scale_1], %[c30]
\n
"
" v_mul_f32 %[c31], %[scale_1], %[c31]
\n
"
_UK_PK_CVT_
(
"%[c16]"
,
"%[c17]"
,
"%[c1
6
]"
)
_UK_PK_CVT_
(
"%[c
18
]"
,
"%[c
19
]"
,
"%[c1
7
]"
)
_UK_PK_CVT_
(
"%[c2
0
]"
,
"%[c2
1
]"
,
"%[c
18
]"
)
_UK_PK_CVT_
(
"%[c2
2
]"
,
"%[c2
3
]"
,
"%[c1
9
]"
)
_UK_PK_CVT_
(
"%[c24]"
,
"%[c25]"
,
"%[c2
0
]"
)
_UK_PK_CVT_
(
"%[c
26
]"
,
"%[c27]"
,
"%[c21]"
)
_UK_PK_CVT_
(
"%[c28]"
,
"%[c29]"
,
"%[c
22
]"
)
_UK_PK_CVT_
(
"%[c30]"
,
"%[c31]"
,
"%[c23]"
)
_UK_PK_CVT_
(
"%[c16]"
,
"%[c17]"
,
"%[c16]"
)
_UK_PK_CVT_
(
"%[c18]"
,
"%[c19]"
,
"%[c1
7
]"
)
_UK_PK_CVT_
(
"%[c20]"
,
"%[c21]"
,
"%[c18]"
)
_UK_PK_CVT_
(
"%[c
22
]"
,
"%[c
23
]"
,
"%[c1
9
]"
)
_UK_PK_CVT_
(
"%[c2
4
]"
,
"%[c2
5
]"
,
"%[c
20
]"
)
_UK_PK_CVT_
(
"%[c2
6
]"
,
"%[c2
7
]"
,
"%[c
2
1]"
)
_UK_PK_CVT_
(
"%[c28]"
,
"%[c2
9
]"
,
"%[c22]"
)
_UK_PK_CVT_
(
"%[c
30
]"
,
"%[c
31
]"
,
"%[c23]"
)
" ;------------------------------
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c16],%[c17]] offset:0 + %[shfl_base]
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c18],%[c19]] offset:4352 + %[shfl_base]
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c20],%[c21]] offset:2176 + %[shfl_base]
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c22],%[c23]] offset:6528 + %[shfl_base]
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_barrier
\n
"
" ds_read_b32 %[c16], %[v_sfl_sld] offset:0 + %[shfl_base]
\n
"
" ds_read_b32 %[c17], %[v_sfl_sld] offset:32 + %[shfl_base]
\n
"
" ds_read_b32 %[c18], %[v_sfl_sld] offset:64 + %[shfl_base]
\n
"
" ds_read_b32 %[c19], %[v_sfl_sld] offset:96 + %[shfl_base]
\n
"
" ds_read_b32 %[c20], %[v_sfl_sld] offset:4352 + %[shfl_base]
\n
"
" ds_read_b32 %[c21], %[v_sfl_sld] offset:4384 + %[shfl_base]
\n
"
" ds_read_b32 %[c22], %[v_sfl_sld] offset:4416 + %[shfl_base]
\n
"
" ds_read_b32 %[c23], %[v_sfl_sld] offset:4448 + %[shfl_base]
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_mov_b64 exec, %[s_execflag_0]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o0], %[c16], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_1]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o1], %[c17], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_2]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o2], %[c18], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_3]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o3], %[c19], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_4]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o4], %[c20], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_5]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o5], %[c21], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_6]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o6], %[c22], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_7]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o7], %[c23], s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
" s_sub_i32 %[s_loop_cnt], %[s_loop_cnt], 1 ; k--
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 0
\n
"
" s_cbranch_scc0 L_end%=
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond
\n
"
" s_cselect_b32 s86, %[s_tile_os_b], 0
\n
"
" s_add_u32 s12, s86, s12
\n
"
" s_addc_u32 s13, 0, s13
\n
"
" s_add_u32 s8, %[s_tile_os_o], s8
\n
"
" s_addc_u32 s9, 0, s9
\n
"
" s_branch L_start%=
\n
"
"L_end%=:
\n
"
" ;------------------------------
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c16],%[c17]] offset:0 + %[shfl_base]
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c18],%[c19]] offset:4352 + %[shfl_base]
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c20],%[c21]] offset:2176 + %[shfl_base]
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c22],%[c23]] offset:6528 + %[shfl_base]
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_barrier
\n
"
" ds_read_b32 %[c16], %[v_sfl_sld] offset:0 + %[shfl_base]
\n
"
" ds_read_b32 %[c17], %[v_sfl_sld] offset:32 + %[shfl_base]
\n
"
" ds_read_b32 %[c18], %[v_sfl_sld] offset:64 + %[shfl_base]
\n
"
" ds_read_b32 %[c19], %[v_sfl_sld] offset:96 + %[shfl_base]
\n
"
" ds_read_b32 %[c20], %[v_sfl_sld] offset:4352 + %[shfl_base]
\n
"
" ds_read_b32 %[c21], %[v_sfl_sld] offset:4384 + %[shfl_base]
\n
"
" ds_read_b32 %[c22], %[v_sfl_sld] offset:4416 + %[shfl_base]
\n
"
" ds_read_b32 %[c23], %[v_sfl_sld] offset:4448 + %[shfl_base]
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_mov_b64 exec, %[s_execflag_0]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o0], %[c16], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_1]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o1], %[c17], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_2]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o2], %[c18], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_3]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o3], %[c19], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_4]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o4], %[c20], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_5]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o5], %[c21], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_6]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o6], %[c22], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_7]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o7], %[c23], s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
" s_sub_i32 %[s_loop_cnt], %[s_loop_cnt], 1 ; k--
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 0
\n
"
" s_cbranch_scc0 L_end%=
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond
\n
"
" s_cselect_b32 s86, %[s_tile_os_b], 0
\n
"
" s_add_u32 s12, s86, s12
\n
"
" s_addc_u32 s13, 0, s13
\n
"
" s_add_u32 s8, %[s_tile_os_o], s8
\n
"
" s_addc_u32 s9, 0, s9
\n
"
" s_branch L_start%=
\n
"
"L_end%=:
\n
"
#undef _UK_MFMA_
#undef _UK_PK_CVT_
...
...
include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16_itl.inc
View file @
2a768425
...
...
@@ -3,706 +3,767 @@
#endif
#if CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_BF16
#
define _UK_MFMA_ "v_mfma_f32_16x16x16_bf16"
#define _UK_MFMA_ "v_mfma_f32_16x16x16_bf16"
#
define _UK_PK_CVT_(x0_, x1_, y_) \
" v_cmp_u_f32 s[36:37], "
x0_
", "
x0_
"
\n
"
\
" v_add3_u32 v50, "
x0_
", %[v_nan_lo], 1
\n
"
\
" v_cndmask_b32 v54, v50, %[v_nan_hi], s[36:37]
\n
"
\
" v_cmp_u_f32 s[36:37], "
x1_
", "
x1_
"
\n
"
\
" v_add3_u32 v50, "
x1_
", %[v_nan_lo], 1
\n
"
\
" v_cndmask_b32 v55, v50, %[v_nan_hi], s[36:37]
\n
"
\
" v_perm_b32 "
y_
", v55, v54, s52
\n
"
#define _UK_PK_CVT_(x0_, x1_, y_)
\
" v_cmp_u_f32 s[36:37], "
x0_
", "
x0_
"
\n
"
\
" v_add3_u32 v50, "
x0_
", %[v_nan_lo], 1
\n
"
\
" v_cndmask_b32 v54, v50, %[v_nan_hi], s[36:37]
\n
"
\
" v_cmp_u_f32 s[36:37], "
x1_
", "
x1_
"
\n
"
\
" v_add3_u32 v50, "
x1_
", %[v_nan_lo], 1
\n
"
\
" v_cndmask_b32 v55, v50, %[v_nan_hi], s[36:37]
\n
"
\
" v_perm_b32 "
y_
", v55, v54, s52
\n
"
#
define _UK_ATOMIC_ADD_ "global_atomic_pk_add_bf16"
#define _UK_ATOMIC_ADD_ "global_atomic_pk_add_bf16"
#elif CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_FP16
#define _UK_MFMA_ "v_mfma_f32_16x16x16_f16"
#
define _UK_PK_CVT_(x0_, x1_, y_) \
" v_cvt_f16_f32 v54, "
x0_
"
\n
"
\
" v_cvt_f16_f32 v55, "
x1_
"
\n
"
\
" v_pack_b32_f16 "
y_
", v54, v55
\n
"
#define _UK_PK_CVT_(x0_, x1_, y_)
\
" v_cvt_f16_f32 v54, "
x0_
"
\n
"
\
" v_cvt_f16_f32 v55, "
x1_
"
\n
"
\
" v_pack_b32_f16 "
y_
", v54, v55
\n
"
#
define _UK_ATOMIC_ADD_ "global_atomic_pk_add_f16"
#define _UK_ATOMIC_ADD_ "global_atomic_pk_add_f16"
#endif
";-------------------------------------------------------------
\n
"
" s_mov_b32 s52, 0x07060302 ; v_perm
\n
"
" s_mov_b64 s[38:39], exec ; save current exec
\n
"
" s_mov_b32 s8, %[s_res_o0]
\n
"
" s_mov_b32 s9, %[s_res_o1]
\n
"
" s_mov_b32 s12, %[s_res_b0]
\n
"
" s_mov_b32 s13, %[s_res_b1]
\n
"
" s_mov_b32 s14, %[s_res_b2]
\n
"
" s_mov_b32 s15, %[s_res_b3]
\n
"
" s_mov_b32 s59, 0
\n
"
" ds_read_b64 v[128:129], %[v_sld_y_os] offset:0 + %[sld_a_base]
\n
"
" ds_read_b64 v[130:131], %[v_sld_y_os] offset:128 + %[sld_a_base]
\n
"
" ds_read_b64 v[132:133], %[v_sld_y_os] offset:1024 + %[sld_a_base]
\n
"
" ds_read_b64 v[134:135], %[v_sld_y_os] offset:1152 + %[sld_a_base]
\n
"
" ds_read_b64 v[136:137], %[v_sld_y_os] offset:2048 + %[sld_a_base]
\n
"
" ds_read_b64 v[138:139], %[v_sld_y_os] offset:2176 + %[sld_a_base]
\n
"
" ds_read_b64 v[140:141], %[v_sld_y_os] offset:3072 + %[sld_a_base]
\n
"
" ds_read_b64 v[142:143], %[v_sld_y_os] offset:3200 + %[sld_a_base]
\n
"
" ds_read_b64 v[144:145], %[v_sld_y_os] offset:4096 + %[sld_a_base]
\n
"
" ds_read_b64 v[146:147], %[v_sld_y_os] offset:4224 + %[sld_a_base]
\n
"
" ds_read_b64 v[148:149], %[v_sld_y_os] offset:5120 + %[sld_a_base]
\n
"
" ds_read_b64 v[150:151], %[v_sld_y_os] offset:5248 + %[sld_a_base]
\n
"
" ds_read_b64 v[152:153], %[v_sld_y_os] offset:6144 + %[sld_a_base]
\n
"
" ds_read_b64 v[154:155], %[v_sld_y_os] offset:6272 + %[sld_a_base]
\n
"
" ds_read_b64 v[156:157], %[v_sld_y_os] offset:7168 + %[sld_a_base]
\n
"
" ds_read_b64 v[158:159], %[v_sld_y_os] offset:7296 + %[sld_a_base]
\n
"
" ds_read_b64 v[160:161], %[v_sld_y_os] offset:8192 + %[sld_a_base]
\n
"
" ds_read_b64 v[162:163], %[v_sld_y_os] offset:8320 + %[sld_a_base]
\n
"
" ds_read_b64 v[164:165], %[v_sld_y_os] offset:9216 + %[sld_a_base]
\n
"
" ds_read_b64 v[166:167], %[v_sld_y_os] offset:9344 + %[sld_a_base]
\n
"
" ds_read_b64 v[168:169], %[v_sld_y_os] offset:10240 + %[sld_a_base]
\n
"
" ds_read_b64 v[170:171], %[v_sld_y_os] offset:10368 + %[sld_a_base]
\n
"
" ds_read_b64 v[172:173], %[v_sld_y_os] offset:11264 + %[sld_a_base]
\n
"
" ds_read_b64 v[174:175], %[v_sld_y_os] offset:11392 + %[sld_a_base]
\n
"
" ds_read_b64 v[176:177], %[v_sld_y_os] offset:12288 + %[sld_a_base]
\n
"
" ds_read_b64 v[178:179], %[v_sld_y_os] offset:12416 + %[sld_a_base]
\n
"
" ds_read_b64 v[180:181], %[v_sld_y_os] offset:13312 + %[sld_a_base]
\n
"
" ds_read_b64 v[182:183], %[v_sld_y_os] offset:13440 + %[sld_a_base]
\n
"
" ds_read_b64 v[184:185], %[v_sld_y_os] offset:14336 + %[sld_a_base]
\n
"
" ds_read_b64 v[186:187], %[v_sld_y_os] offset:14464 + %[sld_a_base]
\n
"
" ds_read_b64 v[188:189], %[v_sld_y_os] offset:15360 + %[sld_a_base]
\n
"
" ds_read_b64 v[190:191], %[v_sld_y_os] offset:15488 + %[sld_a_base]
\n
"
" ds_read_b64 v[192:193], %[v_sld_y_os] offset:16384 + %[sld_a_base]
\n
"
" ds_read_b64 v[194:195], %[v_sld_y_os] offset:16512 + %[sld_a_base]
\n
"
" ds_read_b64 v[196:197], %[v_sld_y_os] offset:17408 + %[sld_a_base]
\n
"
" ds_read_b64 v[198:199], %[v_sld_y_os] offset:17536 + %[sld_a_base]
\n
"
" ds_read_b64 v[200:201], %[v_sld_y_os] offset:18432 + %[sld_a_base]
\n
"
" ds_read_b64 v[202:203], %[v_sld_y_os] offset:18560 + %[sld_a_base]
\n
"
" ds_read_b64 v[204:205], %[v_sld_y_os] offset:19456 + %[sld_a_base]
\n
"
" ds_read_b64 v[206:207], %[v_sld_y_os] offset:19584 + %[sld_a_base]
\n
"
" ds_read_b64 v[208:209], %[v_sld_y_os] offset:20480 + %[sld_a_base]
\n
"
" ds_read_b64 v[210:211], %[v_sld_y_os] offset:20608 + %[sld_a_base]
\n
"
" ds_read_b64 v[212:213], %[v_sld_y_os] offset:21504 + %[sld_a_base]
\n
"
" ds_read_b64 v[214:215], %[v_sld_y_os] offset:21632 + %[sld_a_base]
\n
"
" ds_read_b64 v[216:217], %[v_sld_y_os] offset:22528 + %[sld_a_base]
\n
"
" ds_read_b64 v[218:219], %[v_sld_y_os] offset:22656 + %[sld_a_base]
\n
"
" ds_read_b64 v[220:221], %[v_sld_y_os] offset:23552 + %[sld_a_base]
\n
"
" ds_read_b64 v[222:223], %[v_sld_y_os] offset:23680 + %[sld_a_base]
\n
"
" ds_read_b64 v[224:225], %[v_sld_y_os] offset:24576 + %[sld_a_base]
\n
"
" ds_read_b64 v[226:227], %[v_sld_y_os] offset:24704 + %[sld_a_base]
\n
"
" ds_read_b64 v[228:229], %[v_sld_y_os] offset:25600 + %[sld_a_base]
\n
"
" ds_read_b64 v[230:231], %[v_sld_y_os] offset:25728 + %[sld_a_base]
\n
"
" ds_read_b64 v[232:233], %[v_sld_y_os] offset:26624 + %[sld_a_base]
\n
"
" ds_read_b64 v[234:235], %[v_sld_y_os] offset:26752 + %[sld_a_base]
\n
"
" ds_read_b64 v[236:237], %[v_sld_y_os] offset:27648 + %[sld_a_base]
\n
"
" ds_read_b64 v[238:239], %[v_sld_y_os] offset:27776 + %[sld_a_base]
\n
"
" ds_read_b64 v[240:241], %[v_sld_y_os] offset:28672 + %[sld_a_base]
\n
"
" ds_read_b64 v[242:243], %[v_sld_y_os] offset:28800 + %[sld_a_base]
\n
"
" ds_read_b64 v[244:245], %[v_sld_y_os] offset:29696 + %[sld_a_base]
\n
"
" ds_read_b64 v[246:247], %[v_sld_y_os] offset:29824 + %[sld_a_base]
\n
"
" ds_read_b64 v[248:249], %[v_sld_y_os] offset:30720 + %[sld_a_base]
\n
"
" ds_read_b64 v[250:251], %[v_sld_y_os] offset:30848 + %[sld_a_base]
\n
"
" ds_read_b64 v[252:253], %[v_sld_y_os] offset:31744 + %[sld_a_base]
\n
"
" ds_read_b64 v[254:255], %[v_sld_y_os] offset:31872 + %[sld_a_base]
\n
"
" s_waitcnt 0
\n
"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[12:15], 0 offen offset:3072
\n
"
" s_add_u32 s12, %[s_tile_os_b], s12
\n
"
" s_addc_u32 s13, 0, s13
\n
"
" v_mov_b32 v64, 0
\n
"
" v_mov_b32 v80, 0
\n
"
" v_mov_b32 v65, 0
\n
"
" v_mov_b32 v81, 0
\n
"
" v_mov_b32 v66, 0
\n
"
" v_mov_b32 v82, 0
\n
"
" v_mov_b32 v67, 0
\n
"
" v_mov_b32 v83, 0
\n
"
" v_mov_b32 v68, 0
\n
"
" v_mov_b32 v84, 0
\n
"
" v_mov_b32 v69, 0
\n
"
" v_mov_b32 v85, 0
\n
"
" v_mov_b32 v70, 0
\n
"
" v_mov_b32 v86, 0
\n
"
" v_mov_b32 v71, 0
\n
"
" v_mov_b32 v87, 0
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c0],%[c1]] offset:16640
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c2],%[c3]] offset:20992
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c4],%[c5]] offset:18816
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c6],%[c7]] offset:23168
\n
"
" s_mov_b32 s80, 0
\n
"
" s_waitcnt vmcnt(24)
\n
"
"label_0AA6:
\n
"
" s_waitcnt vmcnt(30) & lgkmcnt(0)
\n
"
" s_barrier
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[0:1], v[128:129], 0
\n
"
" ds_read_b32 v10, %[v_sfl_sld] offset:16640
\n
"
" ds_read_b32 v11, %[v_sfl_sld] offset:16672
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c16],%[c17]] offset:25344
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[2:3], v[130:131], v[64:67]
\n
"
" buffer_load_dwordx4 acc[128:131], %[v_os_b0], s[12:15], 0 offen
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c18],%[c19]] offset:29696
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[4:5], v[132:133], v[64:67]
\n
"
" ds_read_b32 v12, %[v_sfl_sld] offset:16704
\n
"
" ds_read_b32 v13, %[v_sfl_sld] offset:16736
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c20],%[c21]] offset:27520
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[6:7], v[134:135], v[64:67]
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c22],%[c23]] offset:31872
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[8:9], v[136:137], v[64:67]
\n
"
" ds_read_b32 v14, %[v_sfl_sld] offset:20992
\n
"
" ds_read_b32 v15, %[v_sfl_sld] offset:21024
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[10:11], v[138:139], v[64:67]
\n
"
" buffer_load_dwordx4 acc[132:135], %[v_os_b0], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[12:13], v[140:141], v[64:67]
\n
"
" ds_read_b32 v16, %[v_sfl_sld] offset:21056
\n
"
" ds_read_b32 v17, %[v_sfl_sld] offset:21088
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[14:15], v[142:143], v[64:67]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[0:1], v[192:193], 0
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[2:3], v[194:195], v[68:71]
\n
"
" buffer_load_dwordx4 acc[136:139], %[v_os_b0], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[4:5], v[196:197], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[6:7], v[198:199], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[8:9], v[200:201], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[10:11], v[202:203], v[68:71]
\n
"
" buffer_load_dwordx4 acc[140:143], %[v_os_b0], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[12:13], v[204:205], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[14:15], v[206:207], v[68:71]
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_mov_b64 exec, %[s_execflag_0]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o0], v10, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[16:17], v[128:129], 0
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[18:19], v[130:131], v[72:75]
\n
"
" buffer_load_dwordx4 acc[144:147], %[v_os_b1], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[20:21], v[132:133], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[22:23], v[134:135], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[24:25], v[136:137], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[26:27], v[138:139], v[72:75]
\n
"
" buffer_load_dwordx4 acc[148:151], %[v_os_b1], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[28:29], v[140:141], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[30:31], v[142:143], v[72:75]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[16:17], v[192:193], 0
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[18:19], v[194:195], v[76:79]
\n
"
" buffer_load_dwordx4 acc[152:155], %[v_os_b1], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[20:21], v[196:197], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[22:23], v[198:199], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[24:25], v[200:201], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[26:27], v[202:203], v[76:79]
\n
"
" buffer_load_dwordx4 acc[156:159], %[v_os_b1], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[28:29], v[204:205], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[30:31], v[206:207], v[76:79]
\n
"
" s_mov_b64 exec, %[s_execflag_1]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o1], v11, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
" s_waitcnt vmcnt(30)
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[32:33], v[144:145], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[34:35], v[146:147], v[64:67]
\n
"
" buffer_load_dwordx4 acc[160:163], %[v_os_b2], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[36:37], v[148:149], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[38:39], v[150:151], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[40:41], v[152:153], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[42:43], v[154:155], v[64:67]
\n
"
" buffer_load_dwordx4 acc[164:167], %[v_os_b2], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[44:45], v[156:157], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[46:47], v[158:159], v[64:67]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[32:33], v[208:209], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[34:35], v[210:211], v[68:71]
\n
"
" buffer_load_dwordx4 acc[168:171], %[v_os_b2], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[36:37], v[212:213], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[38:39], v[214:215], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[40:41], v[216:217], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[42:43], v[218:219], v[68:71]
\n
"
" buffer_load_dwordx4 acc[172:175], %[v_os_b2], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[44:45], v[220:221], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[46:47], v[222:223], v[68:71]
\n
"
" s_mov_b64 exec, %[s_execflag_2]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o2], v12, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[48:49], v[144:145], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[50:51], v[146:147], v[72:75]
\n
"
" buffer_load_dwordx4 acc[176:179], %[v_os_b3], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[52:53], v[148:149], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[54:55], v[150:151], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[56:57], v[152:153], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[58:59], v[154:155], v[72:75]
\n
"
" buffer_load_dwordx4 acc[180:183], %[v_os_b3], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[60:61], v[156:157], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[62:63], v[158:159], v[72:75]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[48:49], v[208:209], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[50:51], v[210:211], v[76:79]
\n
"
" buffer_load_dwordx4 acc[184:187], %[v_os_b3], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[52:53], v[212:213], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[54:55], v[214:215], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[56:57], v[216:217], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[58:59], v[218:219], v[76:79]
\n
"
" buffer_load_dwordx4 acc[188:191], %[v_os_b3], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[60:61], v[220:221], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[62:63], v[222:223], v[76:79]
\n
"
" s_mov_b64 exec, %[s_execflag_3]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o3], v13, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
" s_waitcnt vmcnt(30)
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[64:65], v[160:161], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[66:67], v[162:163], v[64:67]
\n
"
" buffer_load_dwordx4 acc[192:195], %[v_os_b4], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[68:69], v[164:165], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[70:71], v[166:167], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[72:73], v[168:169], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[74:75], v[170:171], v[64:67]
\n
"
" buffer_load_dwordx4 acc[196:199], %[v_os_b4], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[76:77], v[172:173], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[78:79], v[174:175], v[64:67]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[64:65], v[224:225], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[66:67], v[226:227], v[68:71]
\n
"
" buffer_load_dwordx4 acc[200:203], %[v_os_b4], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[68:69], v[228:229], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[70:71], v[230:231], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[72:73], v[232:233], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[74:75], v[234:235], v[68:71]
\n
"
" buffer_load_dwordx4 acc[204:207], %[v_os_b4], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[76:77], v[236:237], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[78:79], v[238:239], v[68:71]
\n
"
" s_mov_b64 exec, %[s_execflag_4]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o4], v14, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[80:81], v[160:161], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[82:83], v[162:163], v[72:75]
\n
"
" buffer_load_dwordx4 acc[208:211], %[v_os_b5], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[84:85], v[164:165], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[86:87], v[166:167], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[88:89], v[168:169], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[90:91], v[170:171], v[72:75]
\n
"
" buffer_load_dwordx4 acc[212:215], %[v_os_b5], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[92:93], v[172:173], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[94:95], v[174:175], v[72:75]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[80:81], v[224:225], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[82:83], v[226:227], v[76:79]
\n
"
" buffer_load_dwordx4 acc[216:219], %[v_os_b5], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[84:85], v[228:229], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[86:87], v[230:231], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[88:89], v[232:233], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[90:91], v[234:235], v[76:79]
\n
"
" buffer_load_dwordx4 acc[220:223], %[v_os_b5], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[92:93], v[236:237], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[94:95], v[238:239], v[76:79]
\n
"
" s_mov_b64 exec, %[s_execflag_5]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o5], v15, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
" s_waitcnt vmcnt(30)
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[96:97], v[176:177], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[98:99], v[178:179], v[64:67]
\n
"
" buffer_load_dwordx4 acc[224:227], %[v_os_b6], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[100:101], v[180:181], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[102:103], v[182:183], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[104:105], v[184:185], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[106:107], v[186:187], v[64:67]
\n
"
" buffer_load_dwordx4 acc[228:231], %[v_os_b6], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[108:109], v[188:189], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[110:111], v[190:191], v[64:67]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[96:97], v[240:241], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[98:99], v[242:243], v[68:71]
\n
"
" buffer_load_dwordx4 acc[232:235], %[v_os_b6], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[100:101], v[244:245], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[102:103], v[246:247], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[104:105], v[248:249], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[106:107], v[250:251], v[68:71]
\n
"
" buffer_load_dwordx4 acc[236:239], %[v_os_b6], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[108:109], v[252:253], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[110:111], v[254:255], v[68:71]
\n
"
" s_mov_b64 exec, %[s_execflag_6]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o6], v16, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[112:113], v[176:177], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[114:115], v[178:179], v[72:75]
\n
"
" buffer_load_dwordx4 acc[240:243], %[v_os_b7], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[116:117], v[180:181], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[118:119], v[182:183], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[120:121], v[184:185], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[122:123], v[186:187], v[72:75]
\n
"
" buffer_load_dwordx4 acc[244:247], %[v_os_b7], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[124:125], v[188:189], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[126:127], v[190:191], v[72:75]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[112:113], v[240:241], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[114:115], v[242:243], v[76:79]
\n
"
" buffer_load_dwordx4 acc[248:251], %[v_os_b7], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[116:117], v[244:245], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[118:119], v[246:247], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[120:121], v[248:249], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[122:123], v[250:251], v[76:79]
\n
"
" buffer_load_dwordx4 acc[252:255], %[v_os_b7], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[124:125], v[252:253], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[126:127], v[254:255], v[76:79]
\n
"
" s_mov_b64 exec, %[s_execflag_7]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o7], v17, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
" s_add_u32 s60, 0x00000100, s80
\n
"
" s_cmp_lt_u32 s60, %[s_loop_cnt]
\n
"
" s_cselect_b32 s56, %[s_tile_os_b], 0
\n
"
" s_add_u32 s12, s56, s12
\n
"
" s_addc_u32 s13, 0, s13
\n
"
" s_cmp_ge_u32 s80, 0x00000100
\n
"
" s_cselect_b32 s59, %[s_tile_os_o], s59
\n
"
" s_add_u32 s8, s59, s8
\n
"
" s_addc_u32 s9, 0, s9
\n
"
" v_mul_f32 %[c0], %[scale_0], %[c0]
\n
"
" v_mul_f32 %[c1], %[scale_0], %[c1]
\n
"
" v_mul_f32 %[c2], %[scale_0], %[c2]
\n
"
" v_mul_f32 %[c3], %[scale_0], %[c3]
\n
"
" v_mul_f32 %[c4], %[scale_1], %[c4]
\n
"
" v_mul_f32 %[c5], %[scale_1], %[c5]
\n
"
" v_mul_f32 %[c6], %[scale_1], %[c6]
\n
"
" v_mul_f32 %[c7], %[scale_1], %[c7]
\n
"
" v_mul_f32 %[c8], %[scale_0], %[c8]
\n
"
" v_mul_f32 %[c9], %[scale_0], %[c9]
\n
"
" v_mul_f32 %[c10], %[scale_0], %[c10]
\n
"
" v_mul_f32 %[c11], %[scale_0], %[c11]
\n
"
" v_mul_f32 %[c12], %[scale_1], %[c12]
\n
"
" v_mul_f32 %[c13], %[scale_1], %[c13]
\n
"
" v_mul_f32 %[c14], %[scale_1], %[c14]
\n
"
" v_mul_f32 %[c15], %[scale_1], %[c15]
\n
"
_UK_PK_CVT_
(
"%[c0]"
,
"%[c1]"
,
"%[c0]"
)
_UK_PK_CVT_
(
"%[c2]"
,
"%[c3]"
,
"%[c1]"
)
_UK_PK_CVT_
(
"%[c4]"
,
"%[c5]"
,
"%[c2]"
)
_UK_PK_CVT_
(
"%[c6]"
,
"%[c7]"
,
"%[c3]"
)
_UK_PK_CVT_
(
"%[c8]"
,
"%[c9]"
,
"%[c4]"
)
_UK_PK_CVT_
(
"%[c10]"
,
"%[c11]"
,
"%[c5]"
)
_UK_PK_CVT_
(
"%[c12]"
,
"%[c13]"
,
"%[c6]"
)
_UK_PK_CVT_
(
"%[c14]"
,
"%[c15]"
,
"%[c7]"
)
" s_addk_i32 s80, 0x0080
\n
"
" s_cmp_lt_i32 s80, %[s_loop_cnt]
\n
"
" s_cbranch_scc0 label_0EC1
\n
"
" s_waitcnt vmcnt(30) & lgkmcnt(0)
\n
"
" s_barrier
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[128:129], v[128:129], 0
\n
"
" ds_read_b32 v10, %[v_sfl_sld] offset:25344
\n
"
" ds_read_b32 v11, %[v_sfl_sld] offset:25376
\n
"
" ds_write_b64 v3, v[64:65] offset:16640
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[130:131], v[130:131], v[80:83]
\n
"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen
\n
"
" ds_write_b64 v3, v[66:67] offset:20992
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[132:133], v[132:133], v[80:83]
\n
"
" ds_read_b32 v12, %[v_sfl_sld] offset:25408
\n
"
" ds_read_b32 v13, %[v_sfl_sld] offset:25440
\n
"
" ds_write_b64 v3, v[68:69] offset:18816
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[134:135], v[134:135], v[80:83]
\n
"
" ds_write_b64 v3, v[70:71] offset:23168
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[136:137], v[136:137], v[80:83]
\n
"
" ds_read_b32 v14, %[v_sfl_sld] offset:29696
\n
"
" ds_read_b32 v15, %[v_sfl_sld] offset:29728
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[138:139], v[138:139], v[80:83]
\n
"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[140:141], v[140:141], v[80:83]
\n
"
" ds_read_b32 v16, %[v_sfl_sld] offset:29760
\n
"
" ds_read_b32 v17, %[v_sfl_sld] offset:29792
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[142:143], v[142:143], v[80:83]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[128:129], v[192:193], 0
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[130:131], v[194:195], v[84:87]
\n
"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[132:133], v[196:197], v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[134:135], v[198:199], v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[136:137], v[200:201], v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[138:139], v[202:203], v[84:87]
\n
"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[140:141], v[204:205], v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[142:143], v[206:207], v[84:87]
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_mov_b64 exec, %[s_execflag_0]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o0], v10, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[144:145], v[128:129], 0
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[146:147], v[130:131], v[88:91]
\n
"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[148:149], v[132:133], v[88:91]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[150:151], v[134:135], v[88:91]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[152:153], v[136:137], v[88:91]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[154:155], v[138:139], v[88:91]
\n
"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[156:157], v[140:141], v[88:91]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[158:159], v[142:143], v[88:91]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[144:145], v[192:193], 0
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[146:147], v[194:195], v[92:95]
\n
"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[148:149], v[196:197], v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[150:151], v[198:199], v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[152:153], v[200:201], v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[154:155], v[202:203], v[92:95]
\n
"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[156:157], v[204:205], v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[158:159], v[206:207], v[92:95]
\n
"
" s_mov_b64 exec, %[s_execflag_1]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o1], v11, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
" s_waitcnt vmcnt(30)
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[160:161], v[144:145], v[80:83]
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[162:163], v[146:147], v[80:83]
\n
"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[164:165], v[148:149], v[80:83]
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[166:167], v[150:151], v[80:83]
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[168:169], v[152:153], v[80:83]
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[170:171], v[154:155], v[80:83]
\n
"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[172:173], v[156:157], v[80:83]
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[174:175], v[158:159], v[80:83]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[160:161], v[208:209], v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[162:163], v[210:211], v[84:87]
\n
"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[164:165], v[212:213], v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[166:167], v[214:215], v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[168:169], v[216:217], v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[170:171], v[218:219], v[84:87]
\n
"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[172:173], v[220:221], v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[174:175], v[222:223], v[84:87]
\n
"
" s_mov_b64 exec, %[s_execflag_2]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o2], v12, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[176:177], v[144:145], v[88:91]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[178:179], v[146:147], v[88:91]
\n
"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[180:181], v[148:149], v[88:91]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[182:183], v[150:151], v[88:91]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[184:185], v[152:153], v[88:91]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[186:187], v[154:155], v[88:91]
\n
"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[188:189], v[156:157], v[88:91]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[190:191], v[158:159], v[88:91]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[176:177], v[208:209], v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[178:179], v[210:211], v[92:95]
\n
"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[180:181], v[212:213], v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[182:183], v[214:215], v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[184:185], v[216:217], v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[186:187], v[218:219], v[92:95]
\n
"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[188:189], v[220:221], v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[190:191], v[222:223], v[92:95]
\n
"
" s_mov_b64 exec, %[s_execflag_3]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o3], v13, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
" s_waitcnt vmcnt(30)
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[192:193], v[160:161], v[80:83]
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[194:195], v[162:163], v[80:83]
\n
"
" buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[196:197], v[164:165], v[80:83]
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[198:199], v[166:167], v[80:83]
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[200:201], v[168:169], v[80:83]
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[202:203], v[170:171], v[80:83]
\n
"
" buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[204:205], v[172:173], v[80:83]
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[206:207], v[174:175], v[80:83]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[192:193], v[224:225], v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[194:195], v[226:227], v[84:87]
\n
"
" buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[196:197], v[228:229], v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[198:199], v[230:231], v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[200:201], v[232:233], v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[202:203], v[234:235], v[84:87]
\n
"
" buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[204:205], v[236:237], v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[206:207], v[238:239], v[84:87]
\n
"
" s_mov_b64 exec, %[s_execflag_4]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o4], v14, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[208:209], v[160:161], v[88:91]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[210:211], v[162:163], v[88:91]
\n
"
" buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[212:213], v[164:165], v[88:91]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[214:215], v[166:167], v[88:91]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[216:217], v[168:169], v[88:91]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[218:219], v[170:171], v[88:91]
\n
"
" buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[220:221], v[172:173], v[88:91]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[222:223], v[174:175], v[88:91]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[208:209], v[224:225], v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[210:211], v[226:227], v[92:95]
\n
"
" buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[212:213], v[228:229], v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[214:215], v[230:231], v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[216:217], v[232:233], v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[218:219], v[234:235], v[92:95]
\n
"
" buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[220:221], v[236:237], v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[222:223], v[238:239], v[92:95]
\n
"
" s_mov_b64 exec, %[s_execflag_5]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o5], v15, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
" s_waitcnt vmcnt(30)
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[224:225], v[176:177], v[80:83]
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[226:227], v[178:179], v[80:83]
\n
"
" buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[228:229], v[180:181], v[80:83]
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[230:231], v[182:183], v[80:83]
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[232:233], v[184:185], v[80:83]
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[234:235], v[186:187], v[80:83]
\n
"
" buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[236:237], v[188:189], v[80:83]
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[238:239], v[190:191], v[80:83]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[224:225], v[240:241], v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[226:227], v[242:243], v[84:87]
\n
"
" buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[228:229], v[244:245], v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[230:231], v[246:247], v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[232:233], v[248:249], v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[234:235], v[250:251], v[84:87]
\n
"
" buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[236:237], v[252:253], v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[238:239], v[254:255], v[84:87]
\n
"
" s_mov_b64 exec, %[s_execflag_6]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o6], v16, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[240:241], v[176:177], v[88:91]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[242:243], v[178:179], v[88:91]
\n
"
" buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[244:245], v[180:181], v[88:91]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[246:247], v[182:183], v[88:91]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[248:249], v[184:185], v[88:91]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[250:251], v[186:187], v[88:91]
\n
"
" buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[252:253], v[188:189], v[88:91]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[254:255], v[190:191], v[88:91]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[240:241], v[240:241], v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[242:243], v[242:243], v[92:95]
\n
"
" buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[244:245], v[244:245], v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[246:247], v[246:247], v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[248:249], v[248:249], v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[250:251], v[250:251], v[92:95]
\n
"
" buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[252:253], v[252:253], v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[254:255], v[254:255], v[92:95]
\n
"
" s_mov_b64 exec, %[s_execflag_7]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o7], v17, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
" s_add_u32 s60, 0x00000100, s80
\n
"
" s_cmp_lt_u32 s60, %[s_loop_cnt]
\n
"
" s_cselect_b32 s56, s56, 0
\n
"
" s_add_u32 s12, s56, s12
\n
"
" s_addc_u32 s13, 0, s13
\n
"
" s_cmp_ge_u32 s80, 0x00000100
\n
"
" s_cselect_b32 s59, 0x00000100, s59
\n
"
" s_add_u32 s8, s59, s8
\n
"
" s_addc_u32 s9, 0, s9
\n
"
" v_mul_f32 %[c16], %[scale_0], %[c16]
\n
"
" v_mul_f32 %[c17], %[scale_0], %[c17]
\n
"
" v_mul_f32 %[c18], %[scale_0], %[c18]
\n
"
" v_mul_f32 %[c19], %[scale_0], %[c19]
\n
"
" v_mul_f32 %[c20], %[scale_1], %[c20]
\n
"
" v_mul_f32 %[c21], %[scale_1], %[c21]
\n
"
" v_mul_f32 %[c22], %[scale_1], %[c22]
\n
"
" v_mul_f32 %[c23], %[scale_1], %[c23]
\n
"
" v_mul_f32 %[c24], %[scale_0], %[c24]
\n
"
" v_mul_f32 %[c25], %[scale_0], %[c25]
\n
"
" v_mul_f32 %[c26], %[scale_0], %[c26]
\n
"
" v_mul_f32 %[c27], %[scale_0], %[c27]
\n
"
" v_mul_f32 %[c28], %[scale_1], %[c28]
\n
"
" v_mul_f32 %[c29], %[scale_1], %[c29]
\n
"
" v_mul_f32 %[c30], %[scale_1], %[c30]
\n
"
" v_mul_f32 %[c31], %[scale_1], %[c31]
\n
"
_UK_PK_CVT_
(
"%[c16]"
,
"%[c17]"
,
"%[c16]"
)
_UK_PK_CVT_
(
"%[c18]"
,
"%[c19]"
,
"%[c17]"
)
_UK_PK_CVT_
(
"%[c20]"
,
"%[c21]"
,
"%[c18]"
)
_UK_PK_CVT_
(
"%[c22]"
,
"%[c23]"
,
"%[c19]"
)
_UK_PK_CVT_
(
"%[c24]"
,
"%[c25]"
,
"%[c20]"
)
_UK_PK_CVT_
(
"%[c26]"
,
"%[c27]"
,
"%[c21]"
)
_UK_PK_CVT_
(
"%[c28]"
,
"%[c29]"
,
"%[c22]"
)
_UK_PK_CVT_
(
"%[c30]"
,
"%[c31]"
,
"%[c23]"
)
" s_addk_i32 s80, 0x0080
\n
"
" s_cmp_lt_i32 s80, %[s_loop_cnt]
\n
"
" s_cbranch_scc0 label_0EC1
\n
"
" s_branch label_0AA6
\n
"
" label_0EC1:
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_barrier
\n
"
" ds_read_b32 v10, %[v_sfl_sld] offset:16640
\n
"
" ds_read_b32 v11, %[v_sfl_sld] offset:16672
\n
"
" ds_read_b32 v12, %[v_sfl_sld] offset:16704
\n
"
" ds_read_b32 v13, %[v_sfl_sld] offset:16736
\n
"
" ds_read_b32 v14, %[v_sfl_sld] offset:20992
\n
"
" ds_read_b32 v15, %[v_sfl_sld] offset:21024
\n
"
" ds_read_b32 v16, %[v_sfl_sld] offset:21056
\n
"
" ds_read_b32 v17, %[v_sfl_sld] offset:21088
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_mov_b64 exec, %[s_execflag_0]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o0], v10, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_1]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o1], v11, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_2]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o2], v12, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_3]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o3], v13, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_4]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o4], v14, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_5]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o5], v15, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_6]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o6], v16, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_7]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o7], v17, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
" s_add_u32 s8, s59, s8
\n
"
" s_addc_u32 s9, 0, s9
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c16],%[c17]] offset:25344
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c18],%[c19]] offset:29696
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c20],%[c21]] offset:27520
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c22],%[c23]] offset:31872
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_barrier
\n
"
" ds_read_b32 v10, %[v_sfl_sld] offset:25344
\n
"
" ds_read_b32 v11, %[v_sfl_sld] offset:25376
\n
"
" ds_read_b32 v12, %[v_sfl_sld] offset:25408
\n
"
" ds_read_b32 v13, %[v_sfl_sld] offset:25440
\n
"
" ds_read_b32 v14, %[v_sfl_sld] offset:29696
\n
"
" ds_read_b32 v15, %[v_sfl_sld] offset:29728
\n
"
" ds_read_b32 v16, %[v_sfl_sld] offset:29760
\n
"
" ds_read_b32 v17, %[v_sfl_sld] offset:29792
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_mov_b64 exec, %[s_execflag_0]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o0], v10, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_1]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o1], v11, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_2]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o2], v12, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_3]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o3], v13, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_4]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o4], v14, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_5]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o5], v15, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_6]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o6], v16, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_7]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o7], v17, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
" s_mov_b32 s52, 0x07060302 ; v_perm
\n
"
" s_mov_b64 s[38:39], exec ; save current exec
\n
"
" s_mov_b32 s8, %[s_res_o0]
\n
"
" s_mov_b32 s9, %[s_res_o1]
\n
"
" s_mov_b32 s12, %[s_res_b0]
\n
"
" s_mov_b32 s13, %[s_res_b1]
\n
"
" s_mov_b32 s14, %[s_res_b2]
\n
"
" s_mov_b32 s15, %[s_res_b3]
\n
"
" s_mov_b32 s59, 0
\n
"
" ds_read_b64 v[128:129], %[v_sld_y_os] offset:0 + %[sld_a_base]
\n
"
" ds_read_b64 v[130:131], %[v_sld_y_os] offset:128 + %[sld_a_base]
\n
"
" ds_read_b64 v[132:133], %[v_sld_y_os] offset:1024 + %[sld_a_base]
\n
"
" ds_read_b64 v[134:135], %[v_sld_y_os] offset:1152 + %[sld_a_base]
\n
"
" ds_read_b64 v[136:137], %[v_sld_y_os] offset:2048 + %[sld_a_base]
\n
"
" ds_read_b64 v[138:139], %[v_sld_y_os] offset:2176 + %[sld_a_base]
\n
"
" ds_read_b64 v[140:141], %[v_sld_y_os] offset:3072 + %[sld_a_base]
\n
"
" ds_read_b64 v[142:143], %[v_sld_y_os] offset:3200 + %[sld_a_base]
\n
"
" ds_read_b64 v[144:145], %[v_sld_y_os] offset:4096 + %[sld_a_base]
\n
"
" ds_read_b64 v[146:147], %[v_sld_y_os] offset:4224 + %[sld_a_base]
\n
"
" ds_read_b64 v[148:149], %[v_sld_y_os] offset:5120 + %[sld_a_base]
\n
"
" ds_read_b64 v[150:151], %[v_sld_y_os] offset:5248 + %[sld_a_base]
\n
"
" ds_read_b64 v[152:153], %[v_sld_y_os] offset:6144 + %[sld_a_base]
\n
"
" ds_read_b64 v[154:155], %[v_sld_y_os] offset:6272 + %[sld_a_base]
\n
"
" ds_read_b64 v[156:157], %[v_sld_y_os] offset:7168 + %[sld_a_base]
\n
"
" ds_read_b64 v[158:159], %[v_sld_y_os] offset:7296 + %[sld_a_base]
\n
"
" ds_read_b64 v[160:161], %[v_sld_y_os] offset:8192 + %[sld_a_base]
\n
"
" ds_read_b64 v[162:163], %[v_sld_y_os] offset:8320 + %[sld_a_base]
\n
"
" ds_read_b64 v[164:165], %[v_sld_y_os] offset:9216 + %[sld_a_base]
\n
"
" ds_read_b64 v[166:167], %[v_sld_y_os] offset:9344 + %[sld_a_base]
\n
"
" ds_read_b64 v[168:169], %[v_sld_y_os] offset:10240 + %[sld_a_base]
\n
"
" ds_read_b64 v[170:171], %[v_sld_y_os] offset:10368 + %[sld_a_base]
\n
"
" ds_read_b64 v[172:173], %[v_sld_y_os] offset:11264 + %[sld_a_base]
\n
"
" ds_read_b64 v[174:175], %[v_sld_y_os] offset:11392 + %[sld_a_base]
\n
"
" ds_read_b64 v[176:177], %[v_sld_y_os] offset:12288 + %[sld_a_base]
\n
"
" ds_read_b64 v[178:179], %[v_sld_y_os] offset:12416 + %[sld_a_base]
\n
"
" ds_read_b64 v[180:181], %[v_sld_y_os] offset:13312 + %[sld_a_base]
\n
"
" ds_read_b64 v[182:183], %[v_sld_y_os] offset:13440 + %[sld_a_base]
\n
"
" ds_read_b64 v[184:185], %[v_sld_y_os] offset:14336 + %[sld_a_base]
\n
"
" ds_read_b64 v[186:187], %[v_sld_y_os] offset:14464 + %[sld_a_base]
\n
"
" ds_read_b64 v[188:189], %[v_sld_y_os] offset:15360 + %[sld_a_base]
\n
"
" ds_read_b64 v[190:191], %[v_sld_y_os] offset:15488 + %[sld_a_base]
\n
"
" ds_read_b64 v[192:193], %[v_sld_y_os] offset:16384 + %[sld_a_base]
\n
"
" ds_read_b64 v[194:195], %[v_sld_y_os] offset:16512 + %[sld_a_base]
\n
"
" ds_read_b64 v[196:197], %[v_sld_y_os] offset:17408 + %[sld_a_base]
\n
"
" ds_read_b64 v[198:199], %[v_sld_y_os] offset:17536 + %[sld_a_base]
\n
"
" ds_read_b64 v[200:201], %[v_sld_y_os] offset:18432 + %[sld_a_base]
\n
"
" ds_read_b64 v[202:203], %[v_sld_y_os] offset:18560 + %[sld_a_base]
\n
"
" ds_read_b64 v[204:205], %[v_sld_y_os] offset:19456 + %[sld_a_base]
\n
"
" ds_read_b64 v[206:207], %[v_sld_y_os] offset:19584 + %[sld_a_base]
\n
"
" ds_read_b64 v[208:209], %[v_sld_y_os] offset:20480 + %[sld_a_base]
\n
"
" ds_read_b64 v[210:211], %[v_sld_y_os] offset:20608 + %[sld_a_base]
\n
"
" ds_read_b64 v[212:213], %[v_sld_y_os] offset:21504 + %[sld_a_base]
\n
"
" ds_read_b64 v[214:215], %[v_sld_y_os] offset:21632 + %[sld_a_base]
\n
"
" ds_read_b64 v[216:217], %[v_sld_y_os] offset:22528 + %[sld_a_base]
\n
"
" ds_read_b64 v[218:219], %[v_sld_y_os] offset:22656 + %[sld_a_base]
\n
"
" ds_read_b64 v[220:221], %[v_sld_y_os] offset:23552 + %[sld_a_base]
\n
"
" ds_read_b64 v[222:223], %[v_sld_y_os] offset:23680 + %[sld_a_base]
\n
"
" ds_read_b64 v[224:225], %[v_sld_y_os] offset:24576 + %[sld_a_base]
\n
"
" ds_read_b64 v[226:227], %[v_sld_y_os] offset:24704 + %[sld_a_base]
\n
"
" ds_read_b64 v[228:229], %[v_sld_y_os] offset:25600 + %[sld_a_base]
\n
"
" ds_read_b64 v[230:231], %[v_sld_y_os] offset:25728 + %[sld_a_base]
\n
"
" ds_read_b64 v[232:233], %[v_sld_y_os] offset:26624 + %[sld_a_base]
\n
"
" ds_read_b64 v[234:235], %[v_sld_y_os] offset:26752 + %[sld_a_base]
\n
"
" ds_read_b64 v[236:237], %[v_sld_y_os] offset:27648 + %[sld_a_base]
\n
"
" ds_read_b64 v[238:239], %[v_sld_y_os] offset:27776 + %[sld_a_base]
\n
"
" ds_read_b64 v[240:241], %[v_sld_y_os] offset:28672 + %[sld_a_base]
\n
"
" ds_read_b64 v[242:243], %[v_sld_y_os] offset:28800 + %[sld_a_base]
\n
"
" ds_read_b64 v[244:245], %[v_sld_y_os] offset:29696 + %[sld_a_base]
\n
"
" ds_read_b64 v[246:247], %[v_sld_y_os] offset:29824 + %[sld_a_base]
\n
"
" ds_read_b64 v[248:249], %[v_sld_y_os] offset:30720 + %[sld_a_base]
\n
"
" ds_read_b64 v[250:251], %[v_sld_y_os] offset:30848 + %[sld_a_base]
\n
"
" ds_read_b64 v[252:253], %[v_sld_y_os] offset:31744 + %[sld_a_base]
\n
"
" ds_read_b64 v[254:255], %[v_sld_y_os] offset:31872 + %[sld_a_base]
\n
"
" s_waitcnt 0
\n
"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[12:15], 0 offen offset:3072
\n
"
" s_add_u32 s12, %[s_tile_os_b], s12
\n
"
" s_addc_u32 s13, 0, s13
\n
"
" v_mov_b32 v64, 0
\n
"
" v_mov_b32 v80, 0
\n
"
" v_mov_b32 v65, 0
\n
"
" v_mov_b32 v81, 0
\n
"
" v_mov_b32 v66, 0
\n
"
" v_mov_b32 v82, 0
\n
"
" v_mov_b32 v67, 0
\n
"
" v_mov_b32 v83, 0
\n
"
" v_mov_b32 v68, 0
\n
"
" v_mov_b32 v84, 0
\n
"
" v_mov_b32 v69, 0
\n
"
" v_mov_b32 v85, 0
\n
"
" v_mov_b32 v70, 0
\n
"
" v_mov_b32 v86, 0
\n
"
" v_mov_b32 v71, 0
\n
"
" v_mov_b32 v87, 0
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c0],%[c1]] offset:16640
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c2],%[c3]] offset:20992
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c4],%[c5]] offset:18816
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c6],%[c7]] offset:23168
\n
"
" s_mov_b32 s80, 0
\n
"
" s_waitcnt vmcnt(24)
\n
"
"label_0AA6:
\n
"
" s_waitcnt vmcnt(30) & lgkmcnt(0)
\n
"
" s_barrier
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[0:1], v[128:129], 0
\n
"
" ds_read_b32 v10, %[v_sfl_sld] offset:16640
\n
"
" ds_read_b32 v11, %[v_sfl_sld] offset:16672
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c16],%[c17]] offset:25344
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[2:3], v[130:131], v[64:67]
\n
"
" buffer_load_dwordx4 acc[128:131], %[v_os_b0], s[12:15], 0 offen
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c18],%[c19]] offset:29696
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[4:5], v[132:133], v[64:67]
\n
"
" ds_read_b32 v12, %[v_sfl_sld] offset:16704
\n
"
" ds_read_b32 v13, %[v_sfl_sld] offset:16736
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c20],%[c21]] offset:27520
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[6:7], v[134:135], v[64:67]
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c22],%[c23]] offset:31872
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[8:9], v[136:137], v[64:67]
\n
"
" ds_read_b32 v14, %[v_sfl_sld] offset:20992
\n
"
" ds_read_b32 v15, %[v_sfl_sld] offset:21024
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[10:11], v[138:139], v[64:67]
\n
"
" buffer_load_dwordx4 acc[132:135], %[v_os_b0], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[12:13], v[140:141], v[64:67]
\n
"
" ds_read_b32 v16, %[v_sfl_sld] offset:21056
\n
"
" ds_read_b32 v17, %[v_sfl_sld] offset:21088
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[14:15], v[142:143], v[64:67]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[0:1], v[192:193], 0
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[2:3], v[194:195], v[68:71]
\n
"
" buffer_load_dwordx4 acc[136:139], %[v_os_b0], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[4:5], v[196:197], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[6:7], v[198:199], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[8:9], v[200:201], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[10:11], v[202:203], v[68:71]
\n
"
" buffer_load_dwordx4 acc[140:143], %[v_os_b0], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[12:13], v[204:205], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[14:15], v[206:207], v[68:71]
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_mov_b64 exec, %[s_execflag_0]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o0], v10, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[16:17], v[128:129], 0
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[18:19], v[130:131], v[72:75]
\n
"
" buffer_load_dwordx4 acc[144:147], %[v_os_b1], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[20:21], v[132:133], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[22:23], v[134:135], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[24:25], v[136:137], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[26:27], v[138:139], v[72:75]
\n
"
" buffer_load_dwordx4 acc[148:151], %[v_os_b1], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[28:29], v[140:141], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[30:31], v[142:143], v[72:75]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[16:17], v[192:193], 0
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[18:19], v[194:195], v[76:79]
\n
"
" buffer_load_dwordx4 acc[152:155], %[v_os_b1], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[20:21], v[196:197], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[22:23], v[198:199], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[24:25], v[200:201], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[26:27], v[202:203], v[76:79]
\n
"
" buffer_load_dwordx4 acc[156:159], %[v_os_b1], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[28:29], v[204:205], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[30:31], v[206:207], v[76:79]
\n
"
" s_mov_b64 exec, %[s_execflag_1]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o1], v11, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
" s_waitcnt vmcnt(30)
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[32:33], v[144:145], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[34:35], v[146:147], v[64:67]
\n
"
" buffer_load_dwordx4 acc[160:163], %[v_os_b2], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[36:37], v[148:149], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[38:39], v[150:151], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[40:41], v[152:153], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[42:43], v[154:155], v[64:67]
\n
"
" buffer_load_dwordx4 acc[164:167], %[v_os_b2], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[44:45], v[156:157], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[46:47], v[158:159], v[64:67]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[32:33], v[208:209], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[34:35], v[210:211], v[68:71]
\n
"
" buffer_load_dwordx4 acc[168:171], %[v_os_b2], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[36:37], v[212:213], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[38:39], v[214:215], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[40:41], v[216:217], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[42:43], v[218:219], v[68:71]
\n
"
" buffer_load_dwordx4 acc[172:175], %[v_os_b2], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[44:45], v[220:221], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[46:47], v[222:223], v[68:71]
\n
"
" s_mov_b64 exec, %[s_execflag_2]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o2], v12, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[48:49], v[144:145], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[50:51], v[146:147], v[72:75]
\n
"
" buffer_load_dwordx4 acc[176:179], %[v_os_b3], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[52:53], v[148:149], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[54:55], v[150:151], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[56:57], v[152:153], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[58:59], v[154:155], v[72:75]
\n
"
" buffer_load_dwordx4 acc[180:183], %[v_os_b3], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[60:61], v[156:157], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[62:63], v[158:159], v[72:75]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[48:49], v[208:209], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[50:51], v[210:211], v[76:79]
\n
"
" buffer_load_dwordx4 acc[184:187], %[v_os_b3], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[52:53], v[212:213], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[54:55], v[214:215], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[56:57], v[216:217], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[58:59], v[218:219], v[76:79]
\n
"
" buffer_load_dwordx4 acc[188:191], %[v_os_b3], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[60:61], v[220:221], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[62:63], v[222:223], v[76:79]
\n
"
" s_mov_b64 exec, %[s_execflag_3]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o3], v13, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
" s_waitcnt vmcnt(30)
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[64:65], v[160:161], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[66:67], v[162:163], v[64:67]
\n
"
" buffer_load_dwordx4 acc[192:195], %[v_os_b4], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[68:69], v[164:165], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[70:71], v[166:167], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[72:73], v[168:169], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[74:75], v[170:171], v[64:67]
\n
"
" buffer_load_dwordx4 acc[196:199], %[v_os_b4], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[76:77], v[172:173], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[78:79], v[174:175], v[64:67]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[64:65], v[224:225], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[66:67], v[226:227], v[68:71]
\n
"
" buffer_load_dwordx4 acc[200:203], %[v_os_b4], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[68:69], v[228:229], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[70:71], v[230:231], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[72:73], v[232:233], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[74:75], v[234:235], v[68:71]
\n
"
" buffer_load_dwordx4 acc[204:207], %[v_os_b4], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[76:77], v[236:237], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[78:79], v[238:239], v[68:71]
\n
"
" s_mov_b64 exec, %[s_execflag_4]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o4], v14, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[80:81], v[160:161], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[82:83], v[162:163], v[72:75]
\n
"
" buffer_load_dwordx4 acc[208:211], %[v_os_b5], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[84:85], v[164:165], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[86:87], v[166:167], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[88:89], v[168:169], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[90:91], v[170:171], v[72:75]
\n
"
" buffer_load_dwordx4 acc[212:215], %[v_os_b5], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[92:93], v[172:173], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[94:95], v[174:175], v[72:75]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[80:81], v[224:225], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[82:83], v[226:227], v[76:79]
\n
"
" buffer_load_dwordx4 acc[216:219], %[v_os_b5], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[84:85], v[228:229], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[86:87], v[230:231], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[88:89], v[232:233], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[90:91], v[234:235], v[76:79]
\n
"
" buffer_load_dwordx4 acc[220:223], %[v_os_b5], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[92:93], v[236:237], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[94:95], v[238:239], v[76:79]
\n
"
" s_mov_b64 exec, %[s_execflag_5]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o5], v15, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
" s_waitcnt vmcnt(30)
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[96:97], v[176:177], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[98:99], v[178:179], v[64:67]
\n
"
" buffer_load_dwordx4 acc[224:227], %[v_os_b6], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[100:101], v[180:181], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[102:103], v[182:183], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[104:105], v[184:185], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[106:107], v[186:187], v[64:67]
\n
"
" buffer_load_dwordx4 acc[228:231], %[v_os_b6], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[108:109], v[188:189], v[64:67]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[110:111], v[190:191], v[64:67]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[96:97], v[240:241], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[98:99], v[242:243], v[68:71]
\n
"
" buffer_load_dwordx4 acc[232:235], %[v_os_b6], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[100:101], v[244:245], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[102:103], v[246:247], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[104:105], v[248:249], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[106:107], v[250:251], v[68:71]
\n
"
" buffer_load_dwordx4 acc[236:239], %[v_os_b6], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[108:109], v[252:253], v[68:71]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[110:111], v[254:255], v[68:71]
\n
"
" s_mov_b64 exec, %[s_execflag_6]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o6], v16, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[112:113], v[176:177], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[114:115], v[178:179], v[72:75]
\n
"
" buffer_load_dwordx4 acc[240:243], %[v_os_b7], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[116:117], v[180:181], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[118:119], v[182:183], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[120:121], v[184:185], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[122:123], v[186:187], v[72:75]
\n
"
" buffer_load_dwordx4 acc[244:247], %[v_os_b7], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[124:125], v[188:189], v[72:75]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[126:127], v[190:191], v[72:75]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[112:113], v[240:241], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[114:115], v[242:243], v[76:79]
\n
"
" buffer_load_dwordx4 acc[248:251], %[v_os_b7], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[116:117], v[244:245], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[118:119], v[246:247], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[120:121], v[248:249], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[122:123], v[250:251], v[76:79]
\n
"
" buffer_load_dwordx4 acc[252:255], %[v_os_b7], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[124:125], v[252:253], v[76:79]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[126:127], v[254:255], v[76:79]
\n
"
" s_mov_b64 exec, %[s_execflag_7]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o7], v17, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
" s_add_u32 s60, 0x00000100, s80
\n
"
" s_cmp_lt_u32 s60, %[s_loop_cnt]
\n
"
" s_cselect_b32 s56, %[s_tile_os_b], 0
\n
"
" s_add_u32 s12, s56, s12
\n
"
" s_addc_u32 s13, 0, s13
\n
"
" s_cmp_ge_u32 s80, 0x00000100
\n
"
" s_cselect_b32 s59, %[s_tile_os_o], s59
\n
"
" s_add_u32 s8, s59, s8
\n
"
" s_addc_u32 s9, 0, s9
\n
"
" v_mul_f32 %[c0], %[scale_0], %[c0]
\n
"
" v_mul_f32 %[c1], %[scale_0], %[c1]
\n
"
" v_mul_f32 %[c2], %[scale_0], %[c2]
\n
"
" v_mul_f32 %[c3], %[scale_0], %[c3]
\n
"
" v_mul_f32 %[c4], %[scale_1], %[c4]
\n
"
" v_mul_f32 %[c5], %[scale_1], %[c5]
\n
"
" v_mul_f32 %[c6], %[scale_1], %[c6]
\n
"
" v_mul_f32 %[c7], %[scale_1], %[c7]
\n
"
" v_mul_f32 %[c8], %[scale_0], %[c8]
\n
"
" v_mul_f32 %[c9], %[scale_0], %[c9]
\n
"
" v_mul_f32 %[c10], %[scale_0], %[c10]
\n
"
" v_mul_f32 %[c11], %[scale_0], %[c11]
\n
"
" v_mul_f32 %[c12], %[scale_1], %[c12]
\n
"
" v_mul_f32 %[c13], %[scale_1], %[c13]
\n
"
" v_mul_f32 %[c14], %[scale_1], %[c14]
\n
"
" v_mul_f32 %[c15], %[scale_1], %[c15]
\n
"
_UK_PK_CVT_
(
"%[c0]"
,
"%[c1]"
,
"%[c0]"
)
_UK_PK_CVT_
(
"%[c2]"
,
"%[c3]"
,
"%[c1]"
)
_UK_PK_CVT_
(
"%[c4]"
,
"%[c5]"
,
"%[c2]"
)
_UK_PK_CVT_
(
"%[c6]"
,
"%[c7]"
,
"%[c3]"
)
_UK_PK_CVT_
(
"%[c8]"
,
"%[c9]"
,
"%[c4]"
)
_UK_PK_CVT_
(
"%["
"c10]"
,
"%["
"c11]"
,
"%[c5]"
)
_UK_PK_CVT_
(
"%[c12]"
,
"%[c13]"
,
"%[c6]"
)
_UK_PK_CVT_
(
"%[c14]"
,
"%[c15]"
,
"%[c7]"
)
" s_addk_i32 s80, 0x0080
\n
"
" s_cmp_lt_i32 s80, %[s_loop_cnt]
\n
"
" s_cbranch_scc0 label_0EC1
\n
"
" s_waitcnt vmcnt(30) & lgkmcnt(0)
\n
"
" s_barrier
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[128:129], v[128:129], 0
\n
"
" ds_read_b32 v10, %[v_sfl_sld] offset:25344
\n
"
" ds_read_b32 v11, %[v_sfl_sld] offset:25376
\n
"
" ds_write_b64 v3, v[64:65] offset:16640
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[130:131], v[130:131], v[80:83]
\n
"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen
\n
"
" ds_write_b64 v3, v[66:67] offset:20992
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[132:133], v[132:133], v[80:83]
\n
"
" ds_read_b32 v12, %[v_sfl_sld] offset:25408
\n
"
" ds_read_b32 v13, %[v_sfl_sld] offset:25440
\n
"
" ds_write_b64 v3, v[68:69] offset:18816
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[134:135], v[134:135], v[80:83]
\n
"
" ds_write_b64 v3, v[70:71] offset:23168
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[136:137], v[136:137], v[80:83]
\n
"
" ds_read_b32 v14, %[v_sfl_sld] offset:29696
\n
"
" ds_read_b32 v15, %[v_sfl_sld] offset:29728
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[138:139], v[138:139], v[80:83]
\n
"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024 "
"
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[140:141], v[140:141], v[80:83]
\n
"
" ds_read_b32 v16, %[v_sfl_sld] offset:29760
\n
"
" ds_read_b32 v17, %[v_sfl_sld] offset:29792
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[142:143], v[142:143], v[80:83] "
"
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[128:129], v[192:193], 0
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[130:131], v[194:195], v[84:87]
\n
"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048 "
"
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[132:133], v[196:197], "
"v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[134:135], v[198:199], v[84:87] "
"
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[136:137], v[200:201], "
"v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[138:139], v[202:203], v[84:87]
\n
"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072 "
"
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[140:141], v[204:205], "
"v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[142:143], v[206:207], v[84:87]
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_mov_b64 exec, %[s_execflag_0]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o0], v10, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[144:145], v[128:129], 0
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[146:147], v[130:131], v[88:91]
\n
"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[148:149], v[132:133], v[88:91] "
"
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[150:151], v[134:135], "
"v[88:91]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[152:153], v[136:137], v[88:91] "
"
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[154:155], v[138:139], v[88:91]
\n
"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024 "
"
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[156:157], v[140:141], "
"v[88:91]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[158:159], v[142:143], v[88:91] "
"
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[144:145], v[192:193], 0
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[146:147], v[194:195], v[92:95]
\n
"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048 "
"
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[148:149], v[196:197], "
"v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[150:151], v[198:199], v[92:95] "
"
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[152:153], v[200:201], "
"v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[154:155], v[202:203], v[92:95]
\n
"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072 "
"
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[156:157], v[204:205], "
"v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[158:159], v[206:207], v[92:95]
\n
"
" s_mov_b64 exec, %[s_execflag_1]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o1], v11, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
" s_waitcnt vmcnt(30)
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[160:161], v[144:145], v[80:83] "
"
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[162:163], v[146:147], v[80:83]
\n
"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[164:165], v[148:149], v[80:83] "
"
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[166:167], v[150:151], "
"v[80:83]
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[168:169], v[152:153], v[80:83] "
"
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[170:171], v[154:155], v[80:83]
\n
"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024 "
"
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[172:173], v[156:157], "
"v[80:83]
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[174:175], v[158:159], v[80:83] "
"
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[160:161], v[208:209], "
"v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[162:163], v[210:211], v[84:87]
\n
"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048 "
"
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[164:165], v[212:213], "
"v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[166:167], v[214:215], v[84:87] "
"
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[168:169], v[216:217], "
"v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[170:171], v[218:219], v[84:87]
\n
"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072 "
"
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[172:173], v[220:221], "
"v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[174:175], v[222:223], v[84:87]
\n
"
" s_mov_b64 exec, %[s_execflag_2]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o2], v12, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[176:177], v[144:145], v[88:91] "
"
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[178:179], v[146:147], v[88:91]
\n
"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[180:181], v[148:149], v[88:91] "
"
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[182:183], v[150:151], "
"v[88:91]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[184:185], v[152:153], v[88:91] "
"
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[186:187], v[154:155], v[88:91]
\n
"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024 "
"
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[188:189], v[156:157], "
"v[88:91]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[190:191], v[158:159], v[88:91] "
"
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[176:177], v[208:209], "
"v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[178:179], v[210:211], v[92:95]
\n
"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048 "
"
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[180:181], v[212:213], "
"v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[182:183], v[214:215], v[92:95] "
"
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[184:185], v[216:217], "
"v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[186:187], v[218:219], v[92:95]
\n
"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072 "
"
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[188:189], v[220:221], "
"v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[190:191], v[222:223], v[92:95]
\n
"
" s_mov_b64 exec, %[s_execflag_3]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o3], v13, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
" s_waitcnt vmcnt(30)
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[192:193], v[160:161], v[80:83] "
"
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[194:195], v[162:163], v[80:83]
\n
"
" buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[196:197], v[164:165], v[80:83] "
"
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[198:199], v[166:167], "
"v[80:83]
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[200:201], v[168:169], v[80:83] "
"
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[202:203], v[170:171], v[80:83]
\n
"
" buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[12:15], 0 offen offset:1024 "
"
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[204:205], v[172:173], "
"v[80:83]
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[206:207], v[174:175], v[80:83] "
"
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[192:193], v[224:225], "
"v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[194:195], v[226:227], v[84:87]
\n
"
" buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[12:15], 0 offen offset:2048 "
"
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[196:197], v[228:229], "
"v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[198:199], v[230:231], v[84:87] "
"
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[200:201], v[232:233], "
"v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[202:203], v[234:235], v[84:87]
\n
"
" buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[12:15], 0 offen offset:3072 "
"
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[204:205], v[236:237], "
"v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[206:207], v[238:239], v[84:87]
\n
"
" s_mov_b64 exec, %[s_execflag_4]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o4], v14, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[208:209], v[160:161], v[88:91] "
"
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[210:211], v[162:163], v[88:91]
\n
"
" buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[212:213], v[164:165], v[88:91] "
"
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[214:215], v[166:167], "
"v[88:91]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[216:217], v[168:169], v[88:91] "
"
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[218:219], v[170:171], v[88:91]
\n
"
" buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[12:15], 0 offen offset:1024 "
"
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[220:221], v[172:173], "
"v[88:91]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[222:223], v[174:175], v[88:91] "
"
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[208:209], v[224:225], "
"v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[210:211], v[226:227], v[92:95]
\n
"
" buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[12:15], 0 offen offset:2048 "
"
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[212:213], v[228:229], "
"v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[214:215], v[230:231], v[92:95] "
"
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[216:217], v[232:233], "
"v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[218:219], v[234:235], v[92:95]
\n
"
" buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[12:15], 0 offen offset:3072 "
"
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[220:221], v[236:237], "
"v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[222:223], v[238:239], v[92:95]
\n
"
" s_mov_b64 exec, %[s_execflag_5]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o5], v15, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
" s_waitcnt vmcnt(30)
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[224:225], v[176:177], v[80:83] "
"
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[226:227], v[178:179], v[80:83]
\n
"
" buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[228:229], v[180:181], v[80:83] "
"
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[230:231], v[182:183], "
"v[80:83]
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[232:233], v[184:185], v[80:83] "
"
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[234:235], v[186:187], v[80:83]
\n
"
" buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[12:15], 0 offen "
"offset:1024
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[236:237], "
"v[188:189], v[80:83]
\n
"
_UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[238:239], v[190:191], v[80:83] "
"
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[224:225], v[240:241], "
"v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[226:227], v[242:243], v[84:87]
\n
"
" buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[12:15], 0 offen "
"offset:2048
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[228:229], "
"v[244:245], v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[230:231], v[246:247], v[84:87] "
"
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[232:233], v[248:249], "
"v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[234:235], v[250:251], v[84:87]
\n
"
" buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[12:15], 0 offen "
"offset:3072
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[236:237], "
"v[252:253], v[84:87]
\n
"
_UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[238:239], v[254:255], v[84:87]
\n
"
" s_mov_b64 exec, %[s_execflag_6]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o6], v16, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[240:241], v[176:177], v[88:91] "
"
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[242:243], v[178:179], v[88:91]
\n
"
" buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[12:15], 0 offen "
"
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[244:245], v[180:181], "
"v[88:91]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[246:247], v[182:183], v[88:91] "
"
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[248:249], v[184:185], "
"v[88:91]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[250:251], v[186:187], v[88:91]
\n
"
" buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[12:15], 0 offen "
"offset:1024
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[252:253], "
"v[188:189], v[88:91]
\n
"
_UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[254:255], v[190:191], v[88:91] "
"
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[240:241], v[240:241], "
"v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[242:243], v[242:243], v[92:95]
\n
"
" buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[12:15], 0 offen "
"offset:2048
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[244:245], "
"v[244:245], v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[246:247], v[246:247], v[92:95] "
"
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[248:249], v[248:249], "
"v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[250:251], v[250:251], v[92:95]
\n
"
" buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[12:15], 0 offen "
"offset:3072
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[252:253], "
"v[252:253], v[92:95]
\n
"
_UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[254:255], v[254:255], v[92:95]
\n
"
" s_mov_b64 exec, %[s_execflag_7]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o7], v17, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
" s_add_u32 s60, 0x00000100, s80
\n
"
" s_cmp_lt_u32 s60, %[s_loop_cnt]
\n
"
" s_cselect_b32 s56, s56, 0
\n
"
" s_add_u32 s12, s56, s12
\n
"
" s_addc_u32 s13, 0, s13
\n
"
" s_cmp_ge_u32 s80, 0x00000100
\n
"
" s_cselect_b32 s59, 0x00000100, s59
\n
"
" s_add_u32 s8, s59, s8
\n
"
" s_addc_u32 s9, 0, s9
\n
"
" v_mul_f32 %[c16], %[scale_0], %[c16]
\n
"
" v_mul_f32 %[c17], %[scale_0], %[c17]
\n
"
" v_mul_f32 %[c18], %[scale_0], %[c18]
\n
"
" v_mul_f32 %[c19], %[scale_0], %[c19]
\n
"
" v_mul_f32 %[c20], %[scale_1], %[c20]
\n
"
" v_mul_f32 %[c21], %[scale_1], %[c21]
\n
"
" v_mul_f32 %[c22], %[scale_1], %[c22]
\n
"
" v_mul_f32 %[c23], %[scale_1], %[c23]
\n
"
" v_mul_f32 %[c24], %[scale_0], %[c24]
\n
"
" v_mul_f32 %[c25], %[scale_0], %[c25]
\n
"
" v_mul_f32 %[c26], %[scale_0], %[c26]
\n
"
" v_mul_f32 %[c27], %[scale_0], %[c27]
\n
"
" v_mul_f32 %[c28], %[scale_1], %[c28]
\n
"
" v_mul_f32 %[c29], %[scale_1], %[c29]
\n
"
" v_mul_f32 %[c30], %[scale_1], %[c30]
\n
"
" v_mul_f32 %[c31], %[scale_1], %[c31]
\n
"
_UK_PK_CVT_
(
"%[c16]"
,
"%[c17]"
,
"%[c16]"
)
_UK_PK_CVT_
(
"%[c18]"
,
"%[c19]"
,
"%[c17]"
)
_UK_PK_CVT_
(
"%[c20]"
,
"%[c21]"
,
"%[c18]"
)
_UK_PK_CVT_
(
"%[c22]"
,
"%[c23]"
,
"%[c19]"
)
_UK_PK_CVT_
(
"%[c24]"
,
"%[c25]"
,
"%[c20]"
)
_UK_PK_CVT_
(
"%[c26]"
,
"%[c27]"
,
"%[c21]"
)
_UK_PK_CVT_
(
"%[c28]"
,
"%[c29]"
,
"%[c22]"
)
_UK_PK_CVT_
(
"%[c30]"
,
"%[c31]"
,
"%[c23]"
)
" s_addk_i32 s80, 0x0080
\n
"
" s_cmp_lt_i32 s80, %[s_loop_cnt]
\n
"
" s_cbranch_scc0 label_0EC1
\n
"
" s_branch label_0AA6
\n
"
" label_0EC1:
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_barrier
\n
"
" ds_read_b32 v10, %[v_sfl_sld] offset:16640
\n
"
" ds_read_b32 v11, %[v_sfl_sld] offset:16672
\n
"
" ds_read_b32 v12, %[v_sfl_sld] offset:16704
\n
"
" ds_read_b32 v13, %[v_sfl_sld] offset:16736
\n
"
" ds_read_b32 v14, %[v_sfl_sld] offset:20992
\n
"
" ds_read_b32 v15, %[v_sfl_sld] offset:21024
\n
"
" ds_read_b32 v16, %[v_sfl_sld] offset:21056
\n
"
" ds_read_b32 v17, %[v_sfl_sld] offset:21088
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_mov_b64 exec, %[s_execflag_0]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o0], v10, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_1]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o1], v11, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_2]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o2], v12, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_3]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o3], v13, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_4]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o4], v14, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_5]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o5], v15, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_6]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o6], v16, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_7]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o7], v17, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39] "
"
\n
"
" s_add_u32 s8, s59, s8
\n
"
" s_addc_u32 s9, 0, s9
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c16],%[c17]] "
"offset:25344
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c18],%[c19]] "
"offset:29696
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c20],%[c21]] "
"offset:27520
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c22],%[c23]] "
"offset:31872
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_barrier
\n
"
" ds_read_b32 v10, %[v_sfl_sld] offset:25344
\n
"
" ds_read_b32 v11, %[v_sfl_sld] offset:25376
\n
"
" ds_read_b32 v12, %[v_sfl_sld] offset:25408
\n
"
" ds_read_b32 v13, %[v_sfl_sld] offset:25440
\n
"
" ds_read_b32 v14, %[v_sfl_sld] offset:29696
\n
"
" ds_read_b32 v15, %[v_sfl_sld] offset:29728
\n
"
" ds_read_b32 v16, %[v_sfl_sld] offset:29760
\n
"
" ds_read_b32 v17, %[v_sfl_sld] offset:29792
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_mov_b64 exec, %[s_execflag_0]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o0], v10, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_1]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o1], v11, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_2]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o2], v12, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_3]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o3], v13, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_4]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o4], v14, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_5]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o5], v15, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_6]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o6], v16, s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_7]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o7], v17, s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
#undef _UK_MFMA_
#undef _UK_MFMA_
#undef _UK_PK_CVT_
#undef _UK_ATOMIC_ADD_
include/ck_tile/ops/flatmm/block/uk/flatmm_uk_gfx9_32x512x128_1x1x1_16x16x16.inc
View file @
2a768425
...
...
@@ -9,508 +9,509 @@
#endif
"s_mov_b32 s16, %[s_res_a0]
\n
"
"s_mov_b32 s17, %[s_res_a1]
\n
"
"s_mov_b32 s18, %[s_res_a2]
\n
"
"s_mov_b32 s19, %[s_res_a3]
\n
"
"s_mov_b32 s20, %[s_res_b0]
\n
"
"s_mov_b32 s21, %[s_res_b1]
\n
"
"s_mov_b32 s22, %[s_res_b2]
\n
"
"s_mov_b32 s23, %[s_res_b3]
\n
"
// "s_nop 4\n"
"; -- prefetch A0
\n
"
"s_add_u32 m0, 0, %[s_m0_init]
\n
"
"buffer_load_dword %[v_os_a0], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a1], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a2], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a3], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a4], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a5], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a6], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a7], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[smem_sz], %[s_m0_init]
\n
"
"s_cmp_gt_i32 %[s_loop_cnt] 1 ; move a with cond
\n
"
"s_cselect_b32 s86, %[s_tile_os_a], 0 ; move a with cond
\n
"
"s_add_u32 s16, s86, s16 ; move a with cond
\n
"
"s_addc_u32 s17, 0, s17 ; move a with cond
\n
"
"; -- prefetch A1
\n
"
"buffer_load_dword %[v_os_a0], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a1], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a2], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a3], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a4], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a5], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a6], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a7], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, 0, %[s_m0_init]
\n
"
"s_cmp_gt_i32 %[s_loop_cnt] 2 ; move a with cond
\n
"
"s_cselect_b32 s86, %[s_tile_os_a], 0 ; move a with cond
\n
"
"s_add_u32 s16, s86, s16 ; move a with cond
\n
"
"s_addc_u32 s17, 0, s17 ; move a with cond
\n
"
"; -- prefetch B0
\n
"
"buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[20:23], 0 offen
\n
"
"buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[20:23], 0 offen offset:1024
\n
"
"buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[20:23], 0 offen offset:2048
\n
"
"buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[20:23], 0 offen offset:3072
\n
"
"buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[20:23], 0 offen
\n
"
"buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[20:23], 0 offen offset:1024
\n
"
"buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[20:23], 0 offen offset:2048
\n
"
"buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[20:23], 0 offen offset:3072
\n
"
"buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[20:23], 0 offen
\n
"
"buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[20:23], 0 offen offset:1024
\n
"
"buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[20:23], 0 offen offset:2048
\n
"
"buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[20:23], 0 offen offset:3072
\n
"
"buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[20:23], 0 offen
\n
"
"buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[20:23], 0 offen offset:1024
\n
"
"buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[20:23], 0 offen offset:2048
\n
"
"buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[20:23], 0 offen offset:3072
\n
"
"buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[20:23], 0 offen
\n
"
"buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[20:23], 0 offen offset:1024
\n
"
"buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[20:23], 0 offen offset:2048
\n
"
"buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[20:23], 0 offen offset:3072
\n
"
"buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[20:23], 0 offen
\n
"
"buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[20:23], 0 offen offset:1024
\n
"
"buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[20:23], 0 offen offset:2048
\n
"
"buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[20:23], 0 offen offset:3072
\n
"
"buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[20:23], 0 offen
\n
"
"buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[20:23], 0 offen offset:1024
\n
"
"buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[20:23], 0 offen offset:2048
\n
"
"buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[20:23], 0 offen offset:3072
\n
"
"buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[20:23], 0 offen
\n
"
"buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[20:23], 0 offen offset:1024
\n
"
"buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[20:23], 0 offen offset:2048
\n
"
"buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[20:23], 0 offen offset:3072
\n
"
"s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond
\n
"
"s_cselect_b32 s86, %[s_tile_os_b], 0 ; move b with cond
\n
"
"s_add_u32 s20, s86, s20 ; move b with cond
\n
"
"s_addc_u32 s21, 0, s21 ; move b with cond
\n
"
"s_waitcnt vmcnt(40)
\n
"
"s_barrier
\n
"
"ds_read_b128 v[64:67], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_0]
\n
"
// 1024: N stride, 64 K stride
"ds_read_b128 v[68:71], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_1]
\n
"
"ds_read_b128 v[72:75], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_2]
\n
"
"ds_read_b128 v[76:79], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_3]
\n
"
"ds_read_b128 v[80:83], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_4]
\n
"
"ds_read_b128 v[84:87], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_5]
\n
"
"ds_read_b128 v[88:91], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_6]
\n
"
"ds_read_b128 v[92:95], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_7]
\n
"
"L_start%=:
\n
"
" s_waitcnt vmcnt(24) & lgkmcnt(0)
\n
"
" s_barrier
\n
"
_UK_MFMA_
" %[v_acc_0], acc[0:1], v[64:65], %[v_acc_0]
\n
"
_UK_MFMA_
" %[v_acc_0], acc[2:3], v[66:67], %[v_acc_0]
\n
"
" buffer_load_dwordx4 acc[128:131], %[v_os_b0], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_0], acc[4:5], v[68:69], %[v_acc_0]
\n
"
_UK_MFMA_
" %[v_acc_0], acc[6:7], v[70:71], %[v_acc_0]
\n
"
" buffer_load_dword %[v_os_a0], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
" %[v_acc_0], acc[8:9], v[72:73], %[v_acc_0]
\n
"
_UK_MFMA_
" %[v_acc_0], acc[10:11], v[74:75], %[v_acc_0]
\n
"
" buffer_load_dwordx4 acc[132:135], %[v_os_b0], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_0], acc[12:13], v[76:77], %[v_acc_0]
\n
"
_UK_MFMA_
" %[v_acc_0], acc[14:15], v[78:79], %[v_acc_0]
\n
"
" buffer_load_dword %[v_os_a1], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
" %[v_acc_1], acc[0:1], v[80:81], %[v_acc_1]
\n
"
_UK_MFMA_
" %[v_acc_1], acc[2:3], v[82:83], %[v_acc_1]
\n
"
" buffer_load_dwordx4 acc[136:139], %[v_os_b0], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_1], acc[4:5], v[84:85], %[v_acc_1]
\n
"
_UK_MFMA_
" %[v_acc_1], acc[6:7], v[86:87], %[v_acc_1]
\n
"
" buffer_load_dword %[v_os_a2], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
" %[v_acc_1], acc[8:9], v[88:89], %[v_acc_1]
\n
"
_UK_MFMA_
" %[v_acc_1], acc[10:11], v[90:91], %[v_acc_1]
\n
"
" buffer_load_dwordx4 acc[140:143], %[v_os_b0], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_1], acc[12:13], v[92:93], %[v_acc_1]
\n
"
_UK_MFMA_
" %[v_acc_1], acc[14:15], v[94:95], %[v_acc_1]
\n
"
" buffer_load_dword %[v_os_a3], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
" %[v_acc_2], acc[16:17], v[64:65], %[v_acc_2]
\n
"
_UK_MFMA_
" %[v_acc_2], acc[18:19], v[66:67], %[v_acc_2]
\n
"
" buffer_load_dwordx4 acc[144:147], %[v_os_b1], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_2], acc[20:21], v[68:69], %[v_acc_2]
\n
"
_UK_MFMA_
" %[v_acc_2], acc[22:23], v[70:71], %[v_acc_2]
\n
"
" buffer_load_dword %[v_os_a4], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
" %[v_acc_2], acc[24:25], v[72:73], %[v_acc_2]
\n
"
_UK_MFMA_
" %[v_acc_2], acc[26:27], v[74:75], %[v_acc_2]
\n
"
" buffer_load_dwordx4 acc[148:151], %[v_os_b1], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_2], acc[28:29], v[76:77], %[v_acc_2]
\n
"
_UK_MFMA_
" %[v_acc_2], acc[30:31], v[78:79], %[v_acc_2]
\n
"
" buffer_load_dword %[v_os_a5], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
" %[v_acc_3], acc[16:17], v[80:81], %[v_acc_3]
\n
"
_UK_MFMA_
" %[v_acc_3], acc[18:19], v[82:83], %[v_acc_3]
\n
"
" buffer_load_dwordx4 acc[152:155], %[v_os_b1], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_3], acc[20:21], v[84:85], %[v_acc_3]
\n
"
_UK_MFMA_
" %[v_acc_3], acc[22:23], v[86:87], %[v_acc_3]
\n
"
" buffer_load_dword %[v_os_a6], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
" %[v_acc_3], acc[24:25], v[88:89], %[v_acc_3]
\n
"
_UK_MFMA_
" %[v_acc_3], acc[26:27], v[90:91], %[v_acc_3]
\n
"
" buffer_load_dwordx4 acc[156:159], %[v_os_b1], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_3], acc[28:29], v[92:93], %[v_acc_3]
\n
"
_UK_MFMA_
" %[v_acc_3], acc[30:31], v[94:95], %[v_acc_3]
\n
"
" buffer_load_dword %[v_os_a7], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, %[smem_sz], %[s_m0_init]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
" %[v_acc_4], acc[32:33], v[64:65], %[v_acc_4]
\n
"
_UK_MFMA_
" %[v_acc_4], acc[34:35], v[66:67], %[v_acc_4]
\n
"
" buffer_load_dwordx4 acc[160:163], %[v_os_b2], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_4], acc[36:37], v[68:69], %[v_acc_4]
\n
"
_UK_MFMA_
" %[v_acc_4], acc[38:39], v[70:71], %[v_acc_4]
\n
"
" ds_read_b128 v[96:99], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_0]
\n
"
_UK_MFMA_
" %[v_acc_4], acc[40:41], v[72:73], %[v_acc_4]
\n
"
_UK_MFMA_
" %[v_acc_4], acc[42:43], v[74:75], %[v_acc_4]
\n
"
" buffer_load_dwordx4 acc[164:167], %[v_os_b2], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_4], acc[44:45], v[76:77], %[v_acc_4]
\n
"
_UK_MFMA_
" %[v_acc_4], acc[46:47], v[78:79], %[v_acc_4]
\n
"
" ds_read_b128 v[100:103], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_1]
\n
"
_UK_MFMA_
" %[v_acc_5], acc[32:33], v[80:81], %[v_acc_5]
\n
"
_UK_MFMA_
" %[v_acc_5], acc[34:35], v[82:83], %[v_acc_5]
\n
"
" buffer_load_dwordx4 acc[168:171], %[v_os_b2], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_5], acc[36:37], v[84:85], %[v_acc_5]
\n
"
_UK_MFMA_
" %[v_acc_5], acc[38:39], v[86:87], %[v_acc_5]
\n
"
" ds_read_b128 v[104:107], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_2]
\n
"
_UK_MFMA_
" %[v_acc_5], acc[40:41], v[88:89], %[v_acc_5]
\n
"
_UK_MFMA_
" %[v_acc_5], acc[42:43], v[90:91], %[v_acc_5]
\n
"
" buffer_load_dwordx4 acc[172:175], %[v_os_b2], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_5], acc[44:45], v[92:93], %[v_acc_5]
\n
"
_UK_MFMA_
" %[v_acc_5], acc[46:47], v[94:95], %[v_acc_5]
\n
"
" ds_read_b128 v[108:111], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_3]
\n
"
_UK_MFMA_
" %[v_acc_6], acc[48:49], v[64:65], %[v_acc_6]
\n
"
_UK_MFMA_
" %[v_acc_6], acc[50:51], v[66:67], %[v_acc_6]
\n
"
" buffer_load_dwordx4 acc[176:179], %[v_os_b3], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_6], acc[52:53], v[68:69], %[v_acc_6]
\n
"
_UK_MFMA_
" %[v_acc_6], acc[54:55], v[70:71], %[v_acc_6]
\n
"
" ds_read_b128 v[112:115], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_4]
\n
"
_UK_MFMA_
" %[v_acc_6], acc[56:57], v[72:73], %[v_acc_6]
\n
"
_UK_MFMA_
" %[v_acc_6], acc[58:59], v[74:75], %[v_acc_6]
\n
"
" buffer_load_dwordx4 acc[180:183], %[v_os_b3], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_6], acc[60:61], v[76:77], %[v_acc_6]
\n
"
_UK_MFMA_
" %[v_acc_6], acc[62:63], v[78:79], %[v_acc_6]
\n
"
" ds_read_b128 v[116:119], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_5]
\n
"
_UK_MFMA_
" %[v_acc_7], acc[48:49], v[80:81], %[v_acc_7]
\n
"
_UK_MFMA_
" %[v_acc_7], acc[50:51], v[82:83], %[v_acc_7]
\n
"
" buffer_load_dwordx4 acc[184:187], %[v_os_b3], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_7], acc[52:53], v[84:85], %[v_acc_7]
\n
"
_UK_MFMA_
" %[v_acc_7], acc[54:55], v[86:87], %[v_acc_7]
\n
"
" ds_read_b128 v[120:123], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_6]
\n
"
_UK_MFMA_
" %[v_acc_7], acc[56:57], v[88:89], %[v_acc_7]
\n
"
_UK_MFMA_
" %[v_acc_7], acc[58:59], v[90:91], %[v_acc_7]
\n
"
" buffer_load_dwordx4 acc[188:191], %[v_os_b3], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_7], acc[60:61], v[92:93], %[v_acc_7]
\n
"
_UK_MFMA_
" %[v_acc_7], acc[62:63], v[94:95], %[v_acc_7]
\n
"
" ds_read_b128 v[124:127], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_7]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
" %[v_acc_8], acc[64:65], v[64:65], %[v_acc_8]
\n
"
_UK_MFMA_
" %[v_acc_8], acc[66:67], v[66:67], %[v_acc_8]
\n
"
" buffer_load_dwordx4 acc[192:195], %[v_os_b4], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_8], acc[68:69], v[68:69], %[v_acc_8]
\n
"
_UK_MFMA_
" %[v_acc_8], acc[70:71], v[70:71], %[v_acc_8]
\n
"
_UK_MFMA_
" %[v_acc_8], acc[72:73], v[72:73], %[v_acc_8]
\n
"
_UK_MFMA_
" %[v_acc_8], acc[74:75], v[74:75], %[v_acc_8]
\n
"
" buffer_load_dwordx4 acc[196:199], %[v_os_b4], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_8], acc[76:77], v[76:77], %[v_acc_8]
\n
"
_UK_MFMA_
" %[v_acc_8], acc[78:79], v[78:79], %[v_acc_8]
\n
"
_UK_MFMA_
" %[v_acc_9], acc[64:65], v[80:81], %[v_acc_9]
\n
"
_UK_MFMA_
" %[v_acc_9], acc[66:67], v[82:83], %[v_acc_9]
\n
"
" buffer_load_dwordx4 acc[200:203], %[v_os_b4], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_9], acc[68:69], v[84:85], %[v_acc_9]
\n
"
_UK_MFMA_
" %[v_acc_9], acc[70:71], v[86:87], %[v_acc_9]
\n
"
_UK_MFMA_
" %[v_acc_9], acc[72:73], v[88:89], %[v_acc_9]
\n
"
_UK_MFMA_
" %[v_acc_9], acc[74:75], v[90:91], %[v_acc_9]
\n
"
" buffer_load_dwordx4 acc[204:207], %[v_os_b4], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_9], acc[76:77], v[92:93], %[v_acc_9]
\n
"
_UK_MFMA_
" %[v_acc_9], acc[78:79], v[94:95], %[v_acc_9]
\n
"
_UK_MFMA_
" %[v_acc_10], acc[80:81], v[64:65], %[v_acc_10]
\n
"
_UK_MFMA_
" %[v_acc_10], acc[82:83], v[66:67], %[v_acc_10]
\n
"
" buffer_load_dwordx4 acc[208:211], %[v_os_b5], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_10], acc[84:85], v[68:69], %[v_acc_10]
\n
"
_UK_MFMA_
" %[v_acc_10], acc[86:87], v[70:71], %[v_acc_10]
\n
"
_UK_MFMA_
" %[v_acc_10], acc[88:89], v[72:73], %[v_acc_10]
\n
"
_UK_MFMA_
" %[v_acc_10], acc[90:91], v[74:75], %[v_acc_10]
\n
"
" buffer_load_dwordx4 acc[212:215], %[v_os_b5], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_10], acc[92:93], v[76:77], %[v_acc_10]
\n
"
_UK_MFMA_
" %[v_acc_10], acc[94:95], v[78:79], %[v_acc_10]
\n
"
_UK_MFMA_
" %[v_acc_11], acc[80:81], v[80:81], %[v_acc_11]
\n
"
_UK_MFMA_
" %[v_acc_11], acc[82:83], v[82:83], %[v_acc_11]
\n
"
" buffer_load_dwordx4 acc[216:219], %[v_os_b5], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_11], acc[84:85], v[84:85], %[v_acc_11]
\n
"
_UK_MFMA_
" %[v_acc_11], acc[86:87], v[86:87], %[v_acc_11]
\n
"
_UK_MFMA_
" %[v_acc_11], acc[88:89], v[88:89], %[v_acc_11]
\n
"
_UK_MFMA_
" %[v_acc_11], acc[90:91], v[90:91], %[v_acc_11]
\n
"
" buffer_load_dwordx4 acc[220:223], %[v_os_b5], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_11], acc[92:93], v[92:93], %[v_acc_11]
\n
"
_UK_MFMA_
" %[v_acc_11], acc[94:95], v[94:95], %[v_acc_11]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
" %[v_acc_12], acc[96:97], v[64:65], %[v_acc_12]
\n
"
_UK_MFMA_
" %[v_acc_12], acc[98:99], v[66:67], %[v_acc_12]
\n
"
" buffer_load_dwordx4 acc[224:227], %[v_os_b6], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_12], acc[100:101], v[68:69], %[v_acc_12]
\n
"
_UK_MFMA_
" %[v_acc_12], acc[102:103], v[70:71], %[v_acc_12]
\n
"
_UK_MFMA_
" %[v_acc_12], acc[104:105], v[72:73], %[v_acc_12]
\n
"
_UK_MFMA_
" %[v_acc_12], acc[106:107], v[74:75], %[v_acc_12]
\n
"
" buffer_load_dwordx4 acc[228:231], %[v_os_b6], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_12], acc[108:109], v[76:77], %[v_acc_12]
\n
"
_UK_MFMA_
" %[v_acc_12], acc[110:111], v[78:79], %[v_acc_12]
\n
"
_UK_MFMA_
" %[v_acc_13], acc[96:97], v[80:81], %[v_acc_13]
\n
"
_UK_MFMA_
" %[v_acc_13], acc[98:99], v[82:83], %[v_acc_13]
\n
"
" buffer_load_dwordx4 acc[232:235], %[v_os_b6], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_13], acc[100:101], v[84:85], %[v_acc_13]
\n
"
_UK_MFMA_
" %[v_acc_13], acc[102:103], v[86:87], %[v_acc_13]
\n
"
_UK_MFMA_
" %[v_acc_13], acc[104:105], v[88:89], %[v_acc_13]
\n
"
_UK_MFMA_
" %[v_acc_13], acc[106:107], v[90:91], %[v_acc_13]
\n
"
" buffer_load_dwordx4 acc[236:239], %[v_os_b6], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_13], acc[108:109], v[92:93], %[v_acc_13]
\n
"
_UK_MFMA_
" %[v_acc_13], acc[110:111], v[94:95], %[v_acc_13]
\n
"
_UK_MFMA_
" %[v_acc_14], acc[112:113], v[64:65], %[v_acc_14]
\n
"
_UK_MFMA_
" %[v_acc_14], acc[114:115], v[66:67], %[v_acc_14]
\n
"
" buffer_load_dwordx4 acc[240:243], %[v_os_b7], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_14], acc[116:117], v[68:69], %[v_acc_14]
\n
"
_UK_MFMA_
" %[v_acc_14], acc[118:119], v[70:71], %[v_acc_14]
\n
"
_UK_MFMA_
" %[v_acc_14], acc[120:121], v[72:73], %[v_acc_14]
\n
"
_UK_MFMA_
" %[v_acc_14], acc[122:123], v[74:75], %[v_acc_14]
\n
"
" buffer_load_dwordx4 acc[244:247], %[v_os_b7], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_14], acc[124:125], v[76:77], %[v_acc_14]
\n
"
_UK_MFMA_
" %[v_acc_14], acc[126:127], v[78:79], %[v_acc_14]
\n
"
_UK_MFMA_
" %[v_acc_15], acc[112:113], v[80:81], %[v_acc_15]
\n
"
_UK_MFMA_
" %[v_acc_15], acc[114:115], v[82:83], %[v_acc_15]
\n
"
" buffer_load_dwordx4 acc[248:251], %[v_os_b7], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_15], acc[116:117], v[84:85], %[v_acc_15]
\n
"
_UK_MFMA_
" %[v_acc_15], acc[118:119], v[86:87], %[v_acc_15]
\n
"
_UK_MFMA_
" %[v_acc_15], acc[120:121], v[88:89], %[v_acc_15]
\n
"
_UK_MFMA_
" %[v_acc_15], acc[122:123], v[90:91], %[v_acc_15]
\n
"
" buffer_load_dwordx4 acc[252:255], %[v_os_b7], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_15], acc[124:125], v[92:93], %[v_acc_15]
\n
"
_UK_MFMA_
" %[v_acc_15], acc[126:127], v[94:95], %[v_acc_15]
\n
"
" s_sub_i32 %[s_loop_cnt], %[s_loop_cnt], 1
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 0
\n
"
" s_cbranch_scc0 L_end%=
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 2 ; move a with cond
\n
"
" s_cselect_b32 s86, %[s_tile_os_a], 0
\n
"
" s_add_u32 s16, s86, s16
\n
"
" s_addc_u32 s17, 0, s17
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond
\n
"
" s_cselect_b32 s86, %[s_tile_os_b], 0
\n
"
" s_add_u32 s20, s86, s20
\n
"
" s_addc_u32 s21, 0, s21
\n
"
" ;------------------------------------------
\n
"
" s_waitcnt vmcnt(24) & lgkmcnt(0)
\n
"
" s_barrier
\n
"
_UK_MFMA_
" %[v_acc_0], acc[128:129], v[96:97], %[v_acc_0]
\n
"
_UK_MFMA_
" %[v_acc_0], acc[130:131], v[98:99], %[v_acc_0]
\n
"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_0], acc[132:133], v[100:101], %[v_acc_0]
\n
"
_UK_MFMA_
" %[v_acc_0], acc[134:135], v[102:103], %[v_acc_0]
\n
"
" buffer_load_dword %[v_os_a0], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
" %[v_acc_0], acc[136:137], v[104:105], %[v_acc_0]
\n
"
_UK_MFMA_
" %[v_acc_0], acc[138:139], v[106:107], %[v_acc_0]
\n
"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_0], acc[140:141], v[108:109], %[v_acc_0]
\n
"
_UK_MFMA_
" %[v_acc_0], acc[142:143], v[110:111], %[v_acc_0]
\n
"
" buffer_load_dword %[v_os_a1], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
" %[v_acc_1], acc[128:129], v[112:113], %[v_acc_1]
\n
"
_UK_MFMA_
" %[v_acc_1], acc[130:131], v[114:115], %[v_acc_1]
\n
"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_1], acc[132:133], v[116:117], %[v_acc_1]
\n
"
_UK_MFMA_
" %[v_acc_1], acc[134:135], v[118:119], %[v_acc_1]
\n
"
" buffer_load_dword %[v_os_a2], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
" %[v_acc_1], acc[136:137], v[120:121], %[v_acc_1]
\n
"
_UK_MFMA_
" %[v_acc_1], acc[138:139], v[122:123], %[v_acc_1]
\n
"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_1], acc[140:141], v[124:125], %[v_acc_1]
\n
"
_UK_MFMA_
" %[v_acc_1], acc[142:143], v[126:127], %[v_acc_1]
\n
"
" buffer_load_dword %[v_os_a3], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
" %[v_acc_2], acc[144:145], v[96:97], %[v_acc_2]
\n
"
_UK_MFMA_
" %[v_acc_2], acc[146:147], v[98:99], %[v_acc_2]
\n
"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_2], acc[148:149], v[100:101], %[v_acc_2]
\n
"
_UK_MFMA_
" %[v_acc_2], acc[150:151], v[102:103], %[v_acc_2]
\n
"
" buffer_load_dword %[v_os_a4], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
" %[v_acc_2], acc[152:153], v[104:105], %[v_acc_2]
\n
"
_UK_MFMA_
" %[v_acc_2], acc[154:155], v[106:107], %[v_acc_2]
\n
"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_2], acc[156:157], v[108:109], %[v_acc_2]
\n
"
_UK_MFMA_
" %[v_acc_2], acc[158:159], v[110:111], %[v_acc_2]
\n
"
" buffer_load_dword %[v_os_a5], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
" %[v_acc_3], acc[144:145], v[112:113], %[v_acc_3]
\n
"
_UK_MFMA_
" %[v_acc_3], acc[146:147], v[114:115], %[v_acc_3]
\n
"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_3], acc[148:149], v[116:117], %[v_acc_3]
\n
"
_UK_MFMA_
" %[v_acc_3], acc[150:151], v[118:119], %[v_acc_3]
\n
"
" buffer_load_dword %[v_os_a6], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
" %[v_acc_3], acc[152:153], v[120:121], %[v_acc_3]
\n
"
_UK_MFMA_
" %[v_acc_3], acc[154:155], v[122:123], %[v_acc_3]
\n
"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_3], acc[156:157], v[124:125], %[v_acc_3]
\n
"
_UK_MFMA_
" %[v_acc_3], acc[158:159], v[126:127], %[v_acc_3]
\n
"
" buffer_load_dword %[v_os_a7], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, 0, %[s_m0_init]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
" %[v_acc_4], acc[160:161], v[96:97], %[v_acc_4]
\n
"
_UK_MFMA_
" %[v_acc_4], acc[162:163], v[98:99], %[v_acc_4]
\n
"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_4], acc[164:165], v[100:101], %[v_acc_4]
\n
"
_UK_MFMA_
" %[v_acc_4], acc[166:167], v[102:103], %[v_acc_4]
\n
"
" ds_read_b128 v[64:67], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_0]
\n
"
_UK_MFMA_
" %[v_acc_4], acc[168:169], v[104:105], %[v_acc_4]
\n
"
_UK_MFMA_
" %[v_acc_4], acc[170:171], v[106:107], %[v_acc_4]
\n
"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_4], acc[172:173], v[108:109], %[v_acc_4]
\n
"
_UK_MFMA_
" %[v_acc_4], acc[174:175], v[110:111], %[v_acc_4]
\n
"
" ds_read_b128 v[68:71], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_1]
\n
"
_UK_MFMA_
" %[v_acc_5], acc[160:161], v[112:113], %[v_acc_5]
\n
"
_UK_MFMA_
" %[v_acc_5], acc[162:163], v[114:115], %[v_acc_5]
\n
"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_5], acc[164:165], v[116:117], %[v_acc_5]
\n
"
_UK_MFMA_
" %[v_acc_5], acc[166:167], v[118:119], %[v_acc_5]
\n
"
" ds_read_b128 v[72:75], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_2]
\n
"
_UK_MFMA_
" %[v_acc_5], acc[168:169], v[120:121], %[v_acc_5]
\n
"
_UK_MFMA_
" %[v_acc_5], acc[170:171], v[122:123], %[v_acc_5]
\n
"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_5], acc[172:173], v[124:125], %[v_acc_5]
\n
"
_UK_MFMA_
" %[v_acc_5], acc[174:175], v[126:127], %[v_acc_5]
\n
"
" ds_read_b128 v[76:79], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_3]
\n
"
_UK_MFMA_
" %[v_acc_6], acc[176:177], v[96:97], %[v_acc_6]
\n
"
_UK_MFMA_
" %[v_acc_6], acc[178:179], v[98:99], %[v_acc_6]
\n
"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_6], acc[180:181], v[100:101], %[v_acc_6]
\n
"
_UK_MFMA_
" %[v_acc_6], acc[182:183], v[102:103], %[v_acc_6]
\n
"
" ds_read_b128 v[80:83], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_4]
\n
"
_UK_MFMA_
" %[v_acc_6], acc[184:185], v[104:105], %[v_acc_6]
\n
"
_UK_MFMA_
" %[v_acc_6], acc[186:187], v[106:107], %[v_acc_6]
\n
"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_6], acc[188:189], v[108:109], %[v_acc_6]
\n
"
_UK_MFMA_
" %[v_acc_6], acc[190:191], v[110:111], %[v_acc_6]
\n
"
" ds_read_b128 v[84:87], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_5]
\n
"
_UK_MFMA_
" %[v_acc_7], acc[176:177], v[112:113], %[v_acc_7]
\n
"
_UK_MFMA_
" %[v_acc_7], acc[178:179], v[114:115], %[v_acc_7]
\n
"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_7], acc[180:181], v[116:117], %[v_acc_7]
\n
"
_UK_MFMA_
" %[v_acc_7], acc[182:183], v[118:119], %[v_acc_7]
\n
"
" ds_read_b128 v[88:91], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_6]
\n
"
_UK_MFMA_
" %[v_acc_7], acc[184:185], v[120:121], %[v_acc_7]
\n
"
_UK_MFMA_
" %[v_acc_7], acc[186:187], v[122:123], %[v_acc_7]
\n
"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_7], acc[188:189], v[124:125], %[v_acc_7]
\n
"
_UK_MFMA_
" %[v_acc_7], acc[190:191], v[126:127], %[v_acc_7]
\n
"
" ds_read_b128 v[92:95], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_7]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
" %[v_acc_8], acc[192:193], v[96:97], %[v_acc_8]
\n
"
_UK_MFMA_
" %[v_acc_8], acc[194:195], v[98:99], %[v_acc_8]
\n
"
" buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_8], acc[196:197], v[100:101], %[v_acc_8]
\n
"
_UK_MFMA_
" %[v_acc_8], acc[198:199], v[102:103], %[v_acc_8]
\n
"
_UK_MFMA_
" %[v_acc_8], acc[200:201], v[104:105], %[v_acc_8]
\n
"
_UK_MFMA_
" %[v_acc_8], acc[202:203], v[106:107], %[v_acc_8]
\n
"
" buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_8], acc[204:205], v[108:109], %[v_acc_8]
\n
"
_UK_MFMA_
" %[v_acc_8], acc[206:207], v[110:111], %[v_acc_8]
\n
"
_UK_MFMA_
" %[v_acc_9], acc[192:193], v[112:113], %[v_acc_9]
\n
"
_UK_MFMA_
" %[v_acc_9], acc[194:195], v[114:115], %[v_acc_9]
\n
"
" buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_9], acc[196:197], v[116:117], %[v_acc_9]
\n
"
_UK_MFMA_
" %[v_acc_9], acc[198:199], v[118:119], %[v_acc_9]
\n
"
_UK_MFMA_
" %[v_acc_9], acc[200:201], v[120:121], %[v_acc_9]
\n
"
_UK_MFMA_
" %[v_acc_9], acc[202:203], v[122:123], %[v_acc_9]
\n
"
" buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_9], acc[204:205], v[124:125], %[v_acc_9]
\n
"
_UK_MFMA_
" %[v_acc_9], acc[206:207], v[126:127], %[v_acc_9]
\n
"
_UK_MFMA_
" %[v_acc_10], acc[208:209], v[96:97], %[v_acc_10]
\n
"
_UK_MFMA_
" %[v_acc_10], acc[210:211], v[98:99], %[v_acc_10]
\n
"
" buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_10], acc[212:213], v[100:101], %[v_acc_10]
\n
"
_UK_MFMA_
" %[v_acc_10], acc[214:215], v[102:103], %[v_acc_10]
\n
"
_UK_MFMA_
" %[v_acc_10], acc[216:217], v[104:105], %[v_acc_10]
\n
"
_UK_MFMA_
" %[v_acc_10], acc[218:219], v[106:107], %[v_acc_10]
\n
"
" buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_10], acc[220:221], v[108:109], %[v_acc_10]
\n
"
_UK_MFMA_
" %[v_acc_10], acc[222:223], v[110:111], %[v_acc_10]
\n
"
_UK_MFMA_
" %[v_acc_11], acc[208:209], v[112:113], %[v_acc_11]
\n
"
_UK_MFMA_
" %[v_acc_11], acc[210:211], v[114:115], %[v_acc_11]
\n
"
" buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_11], acc[212:213], v[116:117], %[v_acc_11]
\n
"
_UK_MFMA_
" %[v_acc_11], acc[214:215], v[118:119], %[v_acc_11]
\n
"
_UK_MFMA_
" %[v_acc_11], acc[216:217], v[120:121], %[v_acc_11]
\n
"
_UK_MFMA_
" %[v_acc_11], acc[218:219], v[122:123], %[v_acc_11]
\n
"
" buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_11], acc[220:221], v[124:125], %[v_acc_11]
\n
"
_UK_MFMA_
" %[v_acc_11], acc[222:223], v[126:127], %[v_acc_11]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
" %[v_acc_12], acc[224:225], v[96:97], %[v_acc_12]
\n
"
_UK_MFMA_
" %[v_acc_12], acc[226:227], v[98:99], %[v_acc_12]
\n
"
" buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_12], acc[228:229], v[100:101], %[v_acc_12]
\n
"
_UK_MFMA_
" %[v_acc_12], acc[230:231], v[102:103], %[v_acc_12]
\n
"
_UK_MFMA_
" %[v_acc_12], acc[232:233], v[104:105], %[v_acc_12]
\n
"
_UK_MFMA_
" %[v_acc_12], acc[234:235], v[106:107], %[v_acc_12]
\n
"
" buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_12], acc[236:237], v[108:109], %[v_acc_12]
\n
"
_UK_MFMA_
" %[v_acc_12], acc[238:239], v[110:111], %[v_acc_12]
\n
"
_UK_MFMA_
" %[v_acc_13], acc[224:225], v[112:113], %[v_acc_13]
\n
"
_UK_MFMA_
" %[v_acc_13], acc[226:227], v[114:115], %[v_acc_13]
\n
"
" buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_13], acc[228:229], v[116:117], %[v_acc_13]
\n
"
_UK_MFMA_
" %[v_acc_13], acc[230:231], v[118:119], %[v_acc_13]
\n
"
_UK_MFMA_
" %[v_acc_13], acc[232:233], v[120:121], %[v_acc_13]
\n
"
_UK_MFMA_
" %[v_acc_13], acc[234:235], v[122:123], %[v_acc_13]
\n
"
" buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_13], acc[236:237], v[124:125], %[v_acc_13]
\n
"
_UK_MFMA_
" %[v_acc_13], acc[238:239], v[126:127], %[v_acc_13]
\n
"
_UK_MFMA_
" %[v_acc_14], acc[240:241], v[96:97], %[v_acc_14]
\n
"
_UK_MFMA_
" %[v_acc_14], acc[242:243], v[98:99], %[v_acc_14]
\n
"
" buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_14], acc[244:245], v[100:101], %[v_acc_14]
\n
"
_UK_MFMA_
" %[v_acc_14], acc[246:247], v[102:103], %[v_acc_14]
\n
"
_UK_MFMA_
" %[v_acc_14], acc[248:249], v[104:105], %[v_acc_14]
\n
"
_UK_MFMA_
" %[v_acc_14], acc[250:251], v[106:107], %[v_acc_14]
\n
"
" buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_14], acc[252:253], v[108:109], %[v_acc_14]
\n
"
_UK_MFMA_
" %[v_acc_14], acc[254:255], v[110:111], %[v_acc_14]
\n
"
_UK_MFMA_
" %[v_acc_15], acc[240:241], v[112:113], %[v_acc_15]
\n
"
_UK_MFMA_
" %[v_acc_15], acc[242:243], v[114:115], %[v_acc_15]
\n
"
" buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_15], acc[244:245], v[116:117], %[v_acc_15]
\n
"
_UK_MFMA_
" %[v_acc_15], acc[246:247], v[118:119], %[v_acc_15]
\n
"
_UK_MFMA_
" %[v_acc_15], acc[248:249], v[120:121], %[v_acc_15]
\n
"
_UK_MFMA_
" %[v_acc_15], acc[250:251], v[122:123], %[v_acc_15]
\n
"
" buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_15], acc[252:253], v[124:125], %[v_acc_15]
\n
"
_UK_MFMA_
" %[v_acc_15], acc[254:255], v[126:127], %[v_acc_15]
\n
"
" s_sub_i32 %[s_loop_cnt], %[s_loop_cnt], 1
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 0
\n
"
" s_cbranch_scc0 L_end%=
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 2 ; move a with cond
\n
"
" s_cselect_b32 s86, %[s_tile_os_a], 0
\n
"
" s_add_u32 s16, s86, s16
\n
"
" s_addc_u32 s17, 0, s17
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond
\n
"
" s_cselect_b32 s86, %[s_tile_os_b], 0
\n
"
" s_add_u32 s20, s86, s20
\n
"
" s_addc_u32 s21, 0, s21
\n
"
" s_branch L_start%=
\n
"
"L_end%=:
\n
"
" s_nop 2
\n
"
"s_mov_b32 s17, %[s_res_a1]
\n
"
"s_mov_b32 s18, %[s_res_a2]
\n
"
"s_mov_b32 s19, %[s_res_a3]
\n
"
"s_mov_b32 s20, %[s_res_b0]
\n
"
"s_mov_b32 s21, %[s_res_b1]
\n
"
"s_mov_b32 s22, %[s_res_b2]
\n
"
"s_mov_b32 s23, %[s_res_b3]
\n
"
// "s_nop 4\n"
"; -- prefetch A0
\n
"
"s_add_u32 m0, 0, %[s_m0_init]
\n
"
"buffer_load_dword %[v_os_a0], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a1], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a2], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a3], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a4], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a5], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a6], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a7], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[smem_sz], %[s_m0_init]
\n
"
"s_cmp_gt_i32 %[s_loop_cnt] 1 ; move a with cond
\n
"
"s_cselect_b32 s86, %[s_tile_os_a], 0 ; move a with cond
\n
"
"s_add_u32 s16, s86, s16 ; move a with cond
\n
"
"s_addc_u32 s17, 0, s17 ; move a with cond
\n
"
"; -- prefetch A1
\n
"
"buffer_load_dword %[v_os_a0], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a1], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a2], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a3], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a4], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a5], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a6], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, %[s_size_per_issue], m0
\n
"
"buffer_load_dword %[v_os_a7], s[16:19], 0 offen lds
\n
"
"s_add_u32 m0, 0, %[s_m0_init]
\n
"
"s_cmp_gt_i32 %[s_loop_cnt] 2 ; move a with cond
\n
"
"s_cselect_b32 s86, %[s_tile_os_a], 0 ; move a with cond
\n
"
"s_add_u32 s16, s86, s16 ; move a with cond
\n
"
"s_addc_u32 s17, 0, s17 ; move a with cond
\n
"
"; -- prefetch B0
\n
"
"buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[20:23], 0 offen
\n
"
"buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[20:23], 0 offen offset:1024
\n
"
"buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[20:23], 0 offen offset:2048
\n
"
"buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[20:23], 0 offen offset:3072
\n
"
"buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[20:23], 0 offen
\n
"
"buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[20:23], 0 offen offset:1024
\n
"
"buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[20:23], 0 offen offset:2048
\n
"
"buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[20:23], 0 offen offset:3072
\n
"
"buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[20:23], 0 offen
\n
"
"buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[20:23], 0 offen offset:1024
\n
"
"buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[20:23], 0 offen offset:2048
\n
"
"buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[20:23], 0 offen offset:3072
\n
"
"buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[20:23], 0 offen
\n
"
"buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[20:23], 0 offen offset:1024
\n
"
"buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[20:23], 0 offen offset:2048
\n
"
"buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[20:23], 0 offen offset:3072
\n
"
"buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[20:23], 0 offen
\n
"
"buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[20:23], 0 offen offset:1024
\n
"
"buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[20:23], 0 offen offset:2048
\n
"
"buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[20:23], 0 offen offset:3072
\n
"
"buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[20:23], 0 offen
\n
"
"buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[20:23], 0 offen offset:1024
\n
"
"buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[20:23], 0 offen offset:2048
\n
"
"buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[20:23], 0 offen offset:3072
\n
"
"buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[20:23], 0 offen
\n
"
"buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[20:23], 0 offen offset:1024
\n
"
"buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[20:23], 0 offen offset:2048
\n
"
"buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[20:23], 0 offen offset:3072
\n
"
"buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[20:23], 0 offen
\n
"
"buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[20:23], 0 offen offset:1024
\n
"
"buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[20:23], 0 offen offset:2048
\n
"
"buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[20:23], 0 offen offset:3072
\n
"
"s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond
\n
"
"s_cselect_b32 s86, %[s_tile_os_b], 0 ; move b with cond
\n
"
"s_add_u32 s20, s86, s20 ; move b with cond
\n
"
"s_addc_u32 s21, 0, s21 ; move b with cond
\n
"
"s_waitcnt vmcnt(40)
\n
"
"s_barrier
\n
"
"ds_read_b128 v[64:67], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_0]
\n
"
// 1024: N stride, 64
// K stride
"ds_read_b128 v[68:71], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_1]
\n
"
"ds_read_b128 v[72:75], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_2]
\n
"
"ds_read_b128 v[76:79], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_3]
\n
"
"ds_read_b128 v[80:83], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_4]
\n
"
"ds_read_b128 v[84:87], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_5]
\n
"
"ds_read_b128 v[88:91], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_6]
\n
"
"ds_read_b128 v[92:95], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_7]
\n
"
"L_start%=:
\n
"
" s_waitcnt vmcnt(24) & lgkmcnt(0)
\n
"
" s_barrier
\n
"
_UK_MFMA_
" %[v_acc_0], acc[0:1], v[64:65], %[v_acc_0]
\n
"
_UK_MFMA_
" %[v_acc_0], acc[2:3], v[66:67], %[v_acc_0]
\n
"
" buffer_load_dwordx4 acc[128:131], %[v_os_b0], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_0], acc[4:5], v[68:69], %[v_acc_0]
\n
"
_UK_MFMA_
" %[v_acc_0], acc[6:7], v[70:71], %[v_acc_0]
\n
"
" buffer_load_dword %[v_os_a0], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
" %[v_acc_0], acc[8:9], v[72:73], %[v_acc_0]
\n
"
_UK_MFMA_
" %[v_acc_0], acc[10:11], v[74:75], %[v_acc_0]
\n
"
" buffer_load_dwordx4 acc[132:135], %[v_os_b0], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_0], acc[12:13], v[76:77], %[v_acc_0]
\n
"
_UK_MFMA_
" %[v_acc_0], acc[14:15], v[78:79], %[v_acc_0]
\n
"
" buffer_load_dword %[v_os_a1], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
" %[v_acc_1], acc[0:1], v[80:81], %[v_acc_1]
\n
"
_UK_MFMA_
" %[v_acc_1], acc[2:3], v[82:83], %[v_acc_1]
\n
"
" buffer_load_dwordx4 acc[136:139], %[v_os_b0], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_1], acc[4:5], v[84:85], %[v_acc_1]
\n
"
_UK_MFMA_
" %[v_acc_1], acc[6:7], v[86:87], %[v_acc_1]
\n
"
" buffer_load_dword %[v_os_a2], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
" %[v_acc_1], acc[8:9], v[88:89], %[v_acc_1]
\n
"
_UK_MFMA_
" %[v_acc_1], acc[10:11], v[90:91], %[v_acc_1]
\n
"
" buffer_load_dwordx4 acc[140:143], %[v_os_b0], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_1], acc[12:13], v[92:93], %[v_acc_1]
\n
"
_UK_MFMA_
" %[v_acc_1], acc[14:15], v[94:95], %[v_acc_1]
\n
"
" buffer_load_dword %[v_os_a3], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
" %[v_acc_2], acc[16:17], v[64:65], %[v_acc_2]
\n
"
_UK_MFMA_
" %[v_acc_2], acc[18:19], v[66:67], %[v_acc_2]
\n
"
" buffer_load_dwordx4 acc[144:147], %[v_os_b1], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_2], acc[20:21], v[68:69], %[v_acc_2]
\n
"
_UK_MFMA_
" %[v_acc_2], acc[22:23], v[70:71], %[v_acc_2]
\n
"
" buffer_load_dword %[v_os_a4], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
" %[v_acc_2], acc[24:25], v[72:73], %[v_acc_2]
\n
"
_UK_MFMA_
" %[v_acc_2], acc[26:27], v[74:75], %[v_acc_2]
\n
"
" buffer_load_dwordx4 acc[148:151], %[v_os_b1], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_2], acc[28:29], v[76:77], %[v_acc_2]
\n
"
_UK_MFMA_
" %[v_acc_2], acc[30:31], v[78:79], %[v_acc_2]
\n
"
" buffer_load_dword %[v_os_a5], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
" %[v_acc_3], acc[16:17], v[80:81], %[v_acc_3]
\n
"
_UK_MFMA_
" %[v_acc_3], acc[18:19], v[82:83], %[v_acc_3]
\n
"
" buffer_load_dwordx4 acc[152:155], %[v_os_b1], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_3], acc[20:21], v[84:85], %[v_acc_3]
\n
"
_UK_MFMA_
" %[v_acc_3], acc[22:23], v[86:87], %[v_acc_3]
\n
"
" buffer_load_dword %[v_os_a6], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
" %[v_acc_3], acc[24:25], v[88:89], %[v_acc_3]
\n
"
_UK_MFMA_
" %[v_acc_3], acc[26:27], v[90:91], %[v_acc_3]
\n
"
" buffer_load_dwordx4 acc[156:159], %[v_os_b1], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_3], acc[28:29], v[92:93], %[v_acc_3]
\n
"
_UK_MFMA_
" %[v_acc_3], acc[30:31], v[94:95], %[v_acc_3]
\n
"
" buffer_load_dword %[v_os_a7], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, %[smem_sz], %[s_m0_init]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
" %[v_acc_4], acc[32:33], v[64:65], %[v_acc_4]
\n
"
_UK_MFMA_
" %[v_acc_4], acc[34:35], v[66:67], %[v_acc_4]
\n
"
" buffer_load_dwordx4 acc[160:163], %[v_os_b2], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_4], acc[36:37], v[68:69], %[v_acc_4]
\n
"
_UK_MFMA_
" %[v_acc_4], acc[38:39], v[70:71], %[v_acc_4]
\n
"
" ds_read_b128 v[96:99], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_0] "
"
\n
"
_UK_MFMA_
" %[v_acc_4], acc[40:41], v[72:73], %[v_acc_4]
\n
"
_UK_MFMA_
" %[v_acc_4], acc[42:43], v[74:75], %[v_acc_4]
\n
"
" buffer_load_dwordx4 acc[164:167], %[v_os_b2], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_4], acc[44:45], v[76:77], %[v_acc_4]
\n
"
_UK_MFMA_
" %[v_acc_4], acc[46:47], v[78:79], %[v_acc_4]
\n
"
" ds_read_b128 v[100:103], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_1] "
"
\n
"
_UK_MFMA_
" %[v_acc_5], acc[32:33], v[80:81], %[v_acc_5]
\n
"
_UK_MFMA_
" %[v_acc_5], acc[34:35], v[82:83], %[v_acc_5]
\n
"
" buffer_load_dwordx4 acc[168:171], %[v_os_b2], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_5], acc[36:37], v[84:85], %[v_acc_5]
\n
"
_UK_MFMA_
" %[v_acc_5], acc[38:39], v[86:87], %[v_acc_5]
\n
"
" ds_read_b128 v[104:107], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_2] "
"
\n
"
_UK_MFMA_
" %[v_acc_5], acc[40:41], v[88:89], %[v_acc_5]
\n
"
_UK_MFMA_
" %[v_acc_5], acc[42:43], v[90:91], %[v_acc_5]
\n
"
" buffer_load_dwordx4 acc[172:175], %[v_os_b2], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_5], acc[44:45], v[92:93], %[v_acc_5]
\n
"
_UK_MFMA_
" %[v_acc_5], acc[46:47], v[94:95], %[v_acc_5]
\n
"
" ds_read_b128 v[108:111], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_3] "
"
\n
"
_UK_MFMA_
" %[v_acc_6], acc[48:49], v[64:65], %[v_acc_6]
\n
"
_UK_MFMA_
" %[v_acc_6], acc[50:51], v[66:67], %[v_acc_6]
\n
"
" buffer_load_dwordx4 acc[176:179], %[v_os_b3], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_6], acc[52:53], v[68:69], %[v_acc_6]
\n
"
_UK_MFMA_
" %[v_acc_6], acc[54:55], v[70:71], %[v_acc_6]
\n
"
" ds_read_b128 v[112:115], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_4] "
"
\n
"
_UK_MFMA_
" %[v_acc_6], acc[56:57], v[72:73], %[v_acc_6]
\n
"
_UK_MFMA_
" %[v_acc_6], acc[58:59], v[74:75], %[v_acc_6]
\n
"
" buffer_load_dwordx4 acc[180:183], %[v_os_b3], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_6], acc[60:61], v[76:77], %[v_acc_6]
\n
"
_UK_MFMA_
" %[v_acc_6], acc[62:63], v[78:79], %[v_acc_6]
\n
"
" ds_read_b128 v[116:119], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_5] "
"
\n
"
_UK_MFMA_
" %[v_acc_7], acc[48:49], v[80:81], %[v_acc_7]
\n
"
_UK_MFMA_
" %[v_acc_7], acc[50:51], v[82:83], %[v_acc_7]
\n
"
" buffer_load_dwordx4 acc[184:187], %[v_os_b3], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_7], acc[52:53], v[84:85], %[v_acc_7]
\n
"
_UK_MFMA_
" %[v_acc_7], acc[54:55], v[86:87], %[v_acc_7]
\n
"
" ds_read_b128 v[120:123], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_6] "
"
\n
"
_UK_MFMA_
" %[v_acc_7], acc[56:57], v[88:89], %[v_acc_7]
\n
"
_UK_MFMA_
" %[v_acc_7], acc[58:59], v[90:91], %[v_acc_7]
\n
"
" buffer_load_dwordx4 acc[188:191], %[v_os_b3], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_7], acc[60:61], v[92:93], %[v_acc_7]
\n
"
_UK_MFMA_
" %[v_acc_7], acc[62:63], v[94:95], %[v_acc_7]
\n
"
" ds_read_b128 v[124:127], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_7]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
" %[v_acc_8], acc[64:65], v[64:65], %[v_acc_8]
\n
"
_UK_MFMA_
" %[v_acc_8], acc[66:67], v[66:67], %[v_acc_8]
\n
"
" buffer_load_dwordx4 acc[192:195], %[v_os_b4], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_8], acc[68:69], v[68:69], %[v_acc_8]
\n
"
_UK_MFMA_
" %[v_acc_8], acc[70:71], v[70:71], %[v_acc_8]
\n
"
_UK_MFMA_
" %[v_acc_8], acc[72:73], v[72:73], %[v_acc_8]
\n
"
_UK_MFMA_
" %[v_acc_8], acc[74:75], v[74:75], %[v_acc_8]
\n
"
" buffer_load_dwordx4 acc[196:199], %[v_os_b4], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_8], acc[76:77], v[76:77], %[v_acc_8]
\n
"
_UK_MFMA_
" %[v_acc_8], acc[78:79], v[78:79], %[v_acc_8]
\n
"
_UK_MFMA_
" %[v_acc_9], acc[64:65], v[80:81], %[v_acc_9]
\n
"
_UK_MFMA_
" %[v_acc_9], acc[66:67], v[82:83], %[v_acc_9]
\n
"
" buffer_load_dwordx4 acc[200:203], %[v_os_b4], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_9], acc[68:69], v[84:85], %[v_acc_9]
\n
"
_UK_MFMA_
" %[v_acc_9], acc[70:71], v[86:87], %[v_acc_9]
\n
"
_UK_MFMA_
" %[v_acc_9], acc[72:73], v[88:89], %[v_acc_9]
\n
"
_UK_MFMA_
" %[v_acc_9], acc[74:75], v[90:91], %[v_acc_9]
\n
"
" buffer_load_dwordx4 acc[204:207], %[v_os_b4], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_9], acc[76:77], v[92:93], %[v_acc_9]
\n
"
_UK_MFMA_
" %[v_acc_9], acc[78:79], v[94:95], %[v_acc_9]
\n
"
_UK_MFMA_
" %[v_acc_10], acc[80:81], v[64:65], %[v_acc_10]
\n
"
_UK_MFMA_
" %[v_acc_10], acc[82:83], v[66:67], %[v_acc_10]
\n
"
" buffer_load_dwordx4 acc[208:211], %[v_os_b5], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_10], acc[84:85], v[68:69], %[v_acc_10]
\n
"
_UK_MFMA_
" %[v_acc_10], acc[86:87], v[70:71], %[v_acc_10]
\n
"
_UK_MFMA_
" %[v_acc_10], acc[88:89], v[72:73], %[v_acc_10]
\n
"
_UK_MFMA_
" %[v_acc_10], acc[90:91], v[74:75], %[v_acc_10]
\n
"
" buffer_load_dwordx4 acc[212:215], %[v_os_b5], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_10], acc[92:93], v[76:77], %[v_acc_10]
\n
"
_UK_MFMA_
" %[v_acc_10], acc[94:95], v[78:79], %[v_acc_10]
\n
"
_UK_MFMA_
" %[v_acc_11], acc[80:81], v[80:81], %[v_acc_11]
\n
"
_UK_MFMA_
" %[v_acc_11], acc[82:83], v[82:83], %[v_acc_11]
\n
"
" buffer_load_dwordx4 acc[216:219], %[v_os_b5], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_11], acc[84:85], v[84:85], %[v_acc_11]
\n
"
_UK_MFMA_
" %[v_acc_11], acc[86:87], v[86:87], %[v_acc_11]
\n
"
_UK_MFMA_
" %[v_acc_11], acc[88:89], v[88:89], %[v_acc_11]
\n
"
_UK_MFMA_
" %[v_acc_11], acc[90:91], v[90:91], %[v_acc_11]
\n
"
" buffer_load_dwordx4 acc[220:223], %[v_os_b5], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_11], acc[92:93], v[92:93], %[v_acc_11]
\n
"
_UK_MFMA_
" %[v_acc_11], acc[94:95], v[94:95], %[v_acc_11]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
" %[v_acc_12], acc[96:97], v[64:65], %[v_acc_12]
\n
"
_UK_MFMA_
" %[v_acc_12], acc[98:99], v[66:67], %[v_acc_12]
\n
"
" buffer_load_dwordx4 acc[224:227], %[v_os_b6], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_12], acc[100:101], v[68:69], %[v_acc_12]
\n
"
_UK_MFMA_
" %[v_acc_12], acc[102:103], v[70:71], %[v_acc_12]
\n
"
_UK_MFMA_
" %[v_acc_12], acc[104:105], v[72:73], %[v_acc_12]
\n
"
_UK_MFMA_
" %[v_acc_12], acc[106:107], v[74:75], %[v_acc_12]
\n
"
" buffer_load_dwordx4 acc[228:231], %[v_os_b6], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_12], acc[108:109], v[76:77], %[v_acc_12]
\n
"
_UK_MFMA_
" %[v_acc_12], acc[110:111], v[78:79], %[v_acc_12]
\n
"
_UK_MFMA_
" %[v_acc_13], acc[96:97], v[80:81], %[v_acc_13]
\n
"
_UK_MFMA_
" %[v_acc_13], acc[98:99], v[82:83], %[v_acc_13]
\n
"
" buffer_load_dwordx4 acc[232:235], %[v_os_b6], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_13], acc[100:101], v[84:85], %[v_acc_13]
\n
"
_UK_MFMA_
" %[v_acc_13], acc[102:103], v[86:87], %[v_acc_13]
\n
"
_UK_MFMA_
" %[v_acc_13], acc[104:105], v[88:89], %[v_acc_13]
\n
"
_UK_MFMA_
" %[v_acc_13], acc[106:107], v[90:91], %[v_acc_13]
\n
"
" buffer_load_dwordx4 acc[236:239], %[v_os_b6], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_13], acc[108:109], v[92:93], %[v_acc_13]
\n
"
_UK_MFMA_
" %[v_acc_13], acc[110:111], v[94:95], %[v_acc_13]
\n
"
_UK_MFMA_
" %[v_acc_14], acc[112:113], v[64:65], %[v_acc_14]
\n
"
_UK_MFMA_
" %[v_acc_14], acc[114:115], v[66:67], %[v_acc_14]
\n
"
" buffer_load_dwordx4 acc[240:243], %[v_os_b7], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_14], acc[116:117], v[68:69], %[v_acc_14]
\n
"
_UK_MFMA_
" %[v_acc_14], acc[118:119], v[70:71], %[v_acc_14]
\n
"
_UK_MFMA_
" %[v_acc_14], acc[120:121], v[72:73], %[v_acc_14]
\n
"
_UK_MFMA_
" %[v_acc_14], acc[122:123], v[74:75], %[v_acc_14]
\n
"
" buffer_load_dwordx4 acc[244:247], %[v_os_b7], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_14], acc[124:125], v[76:77], %[v_acc_14]
\n
"
_UK_MFMA_
" %[v_acc_14], acc[126:127], v[78:79], %[v_acc_14]
\n
"
_UK_MFMA_
" %[v_acc_15], acc[112:113], v[80:81], %[v_acc_15]
\n
"
_UK_MFMA_
" %[v_acc_15], acc[114:115], v[82:83], %[v_acc_15]
\n
"
" buffer_load_dwordx4 acc[248:251], %[v_os_b7], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_15], acc[116:117], v[84:85], %[v_acc_15]
\n
"
_UK_MFMA_
" %[v_acc_15], acc[118:119], v[86:87], %[v_acc_15]
\n
"
_UK_MFMA_
" %[v_acc_15], acc[120:121], v[88:89], %[v_acc_15]
\n
"
_UK_MFMA_
" %[v_acc_15], acc[122:123], v[90:91], %[v_acc_15]
\n
"
" buffer_load_dwordx4 acc[252:255], %[v_os_b7], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_15], acc[124:125], v[92:93], %[v_acc_15]
\n
"
_UK_MFMA_
" %[v_acc_15], acc[126:127], v[94:95], %[v_acc_15]
\n
"
" s_sub_i32 %[s_loop_cnt], %[s_loop_cnt], 1
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 0
\n
"
" s_cbranch_scc0 L_end%=
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 2 ; move a with cond
\n
"
" s_cselect_b32 s86, %[s_tile_os_a], 0
\n
"
" s_add_u32 s16, s86, s16
\n
"
" s_addc_u32 s17, 0, s17
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond
\n
"
" s_cselect_b32 s86, %[s_tile_os_b], 0
\n
"
" s_add_u32 s20, s86, s20
\n
"
" s_addc_u32 s21, 0, s21
\n
"
" ;------------------------------------------
\n
"
" s_waitcnt vmcnt(24) & lgkmcnt(0)
\n
"
" s_barrier
\n
"
_UK_MFMA_
" %[v_acc_0], acc[128:129], v[96:97], %[v_acc_0]
\n
"
_UK_MFMA_
" %[v_acc_0], acc[130:131], v[98:99], %[v_acc_0]
\n
"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_0], acc[132:133], v[100:101], %[v_acc_0]
\n
"
_UK_MFMA_
" %[v_acc_0], acc[134:135], v[102:103], %[v_acc_0]
\n
"
" buffer_load_dword %[v_os_a0], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
" %[v_acc_0], acc[136:137], v[104:105], %[v_acc_0]
\n
"
_UK_MFMA_
" %[v_acc_0], acc[138:139], v[106:107], %[v_acc_0]
\n
"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_0], acc[140:141], v[108:109], %[v_acc_0]
\n
"
_UK_MFMA_
" %[v_acc_0], acc[142:143], v[110:111], %[v_acc_0]
\n
"
" buffer_load_dword %[v_os_a1], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
" %[v_acc_1], acc[128:129], v[112:113], %[v_acc_1]
\n
"
_UK_MFMA_
" %[v_acc_1], acc[130:131], v[114:115], %[v_acc_1]
\n
"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_1], acc[132:133], v[116:117], %[v_acc_1]
\n
"
_UK_MFMA_
" %[v_acc_1], acc[134:135], v[118:119], %[v_acc_1]
\n
"
" buffer_load_dword %[v_os_a2], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
" %[v_acc_1], acc[136:137], v[120:121], %[v_acc_1]
\n
"
_UK_MFMA_
" %[v_acc_1], acc[138:139], v[122:123], %[v_acc_1]
\n
"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_1], acc[140:141], v[124:125], %[v_acc_1]
\n
"
_UK_MFMA_
" %[v_acc_1], acc[142:143], v[126:127], %[v_acc_1]
\n
"
" buffer_load_dword %[v_os_a3], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
" %[v_acc_2], acc[144:145], v[96:97], %[v_acc_2]
\n
"
_UK_MFMA_
" %[v_acc_2], acc[146:147], v[98:99], %[v_acc_2]
\n
"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_2], acc[148:149], v[100:101], %[v_acc_2]
\n
"
_UK_MFMA_
" %[v_acc_2], acc[150:151], v[102:103], %[v_acc_2]
\n
"
" buffer_load_dword %[v_os_a4], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
" %[v_acc_2], acc[152:153], v[104:105], %[v_acc_2]
\n
"
_UK_MFMA_
" %[v_acc_2], acc[154:155], v[106:107], %[v_acc_2]
\n
"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_2], acc[156:157], v[108:109], %[v_acc_2]
\n
"
_UK_MFMA_
" %[v_acc_2], acc[158:159], v[110:111], %[v_acc_2]
\n
"
" buffer_load_dword %[v_os_a5], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
" %[v_acc_3], acc[144:145], v[112:113], %[v_acc_3]
\n
"
_UK_MFMA_
" %[v_acc_3], acc[146:147], v[114:115], %[v_acc_3]
\n
"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_3], acc[148:149], v[116:117], %[v_acc_3]
\n
"
_UK_MFMA_
" %[v_acc_3], acc[150:151], v[118:119], %[v_acc_3]
\n
"
" buffer_load_dword %[v_os_a6], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
" %[v_acc_3], acc[152:153], v[120:121], %[v_acc_3]
\n
"
_UK_MFMA_
" %[v_acc_3], acc[154:155], v[122:123], %[v_acc_3]
\n
"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_3], acc[156:157], v[124:125], %[v_acc_3]
\n
"
_UK_MFMA_
" %[v_acc_3], acc[158:159], v[126:127], %[v_acc_3]
\n
"
" buffer_load_dword %[v_os_a7], s[16:19], 0 offen lds
\n
"
" s_add_u32 m0, 0, %[s_m0_init]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
" %[v_acc_4], acc[160:161], v[96:97], %[v_acc_4]
\n
"
_UK_MFMA_
" %[v_acc_4], acc[162:163], v[98:99], %[v_acc_4]
\n
"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_4], acc[164:165], v[100:101], %[v_acc_4]
\n
"
_UK_MFMA_
" %[v_acc_4], acc[166:167], v[102:103], %[v_acc_4]
\n
"
" ds_read_b128 v[64:67], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_0]
\n
"
_UK_MFMA_
" %[v_acc_4], acc[168:169], v[104:105], %[v_acc_4]
\n
"
_UK_MFMA_
" %[v_acc_4], acc[170:171], v[106:107], %[v_acc_4]
\n
"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_4], acc[172:173], v[108:109], %[v_acc_4]
\n
"
_UK_MFMA_
" %[v_acc_4], acc[174:175], v[110:111], %[v_acc_4]
\n
"
" ds_read_b128 v[68:71], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_1]
\n
"
_UK_MFMA_
" %[v_acc_5], acc[160:161], v[112:113], %[v_acc_5]
\n
"
_UK_MFMA_
" %[v_acc_5], acc[162:163], v[114:115], %[v_acc_5]
\n
"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_5], acc[164:165], v[116:117], %[v_acc_5]
\n
"
_UK_MFMA_
" %[v_acc_5], acc[166:167], v[118:119], %[v_acc_5]
\n
"
" ds_read_b128 v[72:75], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_2] "
"
\n
"
_UK_MFMA_
" %[v_acc_5], acc[168:169], v[120:121], %[v_acc_5]
\n
"
_UK_MFMA_
" %[v_acc_5], acc[170:171], v[122:123], %[v_acc_5]
\n
"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_5], acc[172:173], v[124:125], %[v_acc_5]
\n
"
_UK_MFMA_
" %[v_acc_5], acc[174:175], v[126:127], %[v_acc_5]
\n
"
" ds_read_b128 v[76:79], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_3] "
"
\n
"
_UK_MFMA_
" %[v_acc_6], acc[176:177], v[96:97], %[v_acc_6]
\n
"
_UK_MFMA_
" %[v_acc_6], acc[178:179], v[98:99], %[v_acc_6]
\n
"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_6], acc[180:181], v[100:101], %[v_acc_6]
\n
"
_UK_MFMA_
" %[v_acc_6], acc[182:183], v[102:103], %[v_acc_6]
\n
"
" ds_read_b128 v[80:83], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_4] "
"
\n
"
_UK_MFMA_
" %[v_acc_6], acc[184:185], v[104:105], %[v_acc_6]
\n
"
_UK_MFMA_
" %[v_acc_6], acc[186:187], v[106:107], %[v_acc_6]
\n
"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_6], acc[188:189], v[108:109], %[v_acc_6]
\n
"
_UK_MFMA_
" %[v_acc_6], acc[190:191], v[110:111], %[v_acc_6]
\n
"
" ds_read_b128 v[84:87], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_5] "
"
\n
"
_UK_MFMA_
" %[v_acc_7], acc[176:177], v[112:113], %[v_acc_7]
\n
"
_UK_MFMA_
" %[v_acc_7], acc[178:179], v[114:115], %[v_acc_7]
\n
"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_7], acc[180:181], v[116:117], %[v_acc_7]
\n
"
_UK_MFMA_
" %[v_acc_7], acc[182:183], v[118:119], %[v_acc_7]
\n
"
" ds_read_b128 v[88:91], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_6] "
"
\n
"
_UK_MFMA_
" %[v_acc_7], acc[184:185], v[120:121], %[v_acc_7]
\n
"
_UK_MFMA_
" %[v_acc_7], acc[186:187], v[122:123], %[v_acc_7]
\n
"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_7], acc[188:189], v[124:125], %[v_acc_7]
\n
"
_UK_MFMA_
" %[v_acc_7], acc[190:191], v[126:127], %[v_acc_7]
\n
"
" ds_read_b128 v[92:95], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_7]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
" %[v_acc_8], acc[192:193], v[96:97], %[v_acc_8]
\n
"
_UK_MFMA_
" %[v_acc_8], acc[194:195], v[98:99], %[v_acc_8]
\n
"
" buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_8], acc[196:197], v[100:101], %[v_acc_8]
\n
"
_UK_MFMA_
" %[v_acc_8], acc[198:199], v[102:103], %[v_acc_8]
\n
"
_UK_MFMA_
" %[v_acc_8], acc[200:201], v[104:105], %[v_acc_8]
\n
"
_UK_MFMA_
" %[v_acc_8], acc[202:203], v[106:107], %[v_acc_8]
\n
"
" buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_8], acc[204:205], v[108:109], %[v_acc_8]
\n
"
_UK_MFMA_
" %[v_acc_8], acc[206:207], v[110:111], %[v_acc_8]
\n
"
_UK_MFMA_
" %[v_acc_9], acc[192:193], v[112:113], %[v_acc_9]
\n
"
_UK_MFMA_
" %[v_acc_9], acc[194:195], v[114:115], %[v_acc_9]
\n
"
" buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_9], acc[196:197], v[116:117], %[v_acc_9]
\n
"
_UK_MFMA_
" %[v_acc_9], acc[198:199], v[118:119], %[v_acc_9]
\n
"
_UK_MFMA_
" %[v_acc_9], acc[200:201], v[120:121], %[v_acc_9]
\n
"
_UK_MFMA_
" %[v_acc_9], acc[202:203], v[122:123], %[v_acc_9]
\n
"
" buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_9], acc[204:205], v[124:125], %[v_acc_9]
\n
"
_UK_MFMA_
" %[v_acc_9], acc[206:207], v[126:127], %[v_acc_9]
\n
"
_UK_MFMA_
" %[v_acc_10], acc[208:209], v[96:97], %[v_acc_10]
\n
"
_UK_MFMA_
" %[v_acc_10], acc[210:211], v[98:99], %[v_acc_10]
\n
"
" buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_10], acc[212:213], v[100:101], %[v_acc_10]
\n
"
_UK_MFMA_
" %[v_acc_10], acc[214:215], v[102:103], %[v_acc_10]
\n
"
_UK_MFMA_
" %[v_acc_10], acc[216:217], v[104:105], %[v_acc_10]
\n
"
_UK_MFMA_
" %[v_acc_10], acc[218:219], v[106:107], %[v_acc_10]
\n
"
" buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_10], acc[220:221], v[108:109], %[v_acc_10]
\n
"
_UK_MFMA_
" %[v_acc_10], acc[222:223], v[110:111], %[v_acc_10]
\n
"
_UK_MFMA_
" %[v_acc_11], acc[208:209], v[112:113], %[v_acc_11]
\n
"
_UK_MFMA_
" %[v_acc_11], acc[210:211], v[114:115], %[v_acc_11]
\n
"
" buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_11], acc[212:213], v[116:117], %[v_acc_11]
\n
"
_UK_MFMA_
" %[v_acc_11], acc[214:215], v[118:119], %[v_acc_11]
\n
"
_UK_MFMA_
" %[v_acc_11], acc[216:217], v[120:121], %[v_acc_11]
\n
"
_UK_MFMA_
" %[v_acc_11], acc[218:219], v[122:123], %[v_acc_11]
\n
"
" buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_11], acc[220:221], v[124:125], %[v_acc_11]
\n
"
_UK_MFMA_
" %[v_acc_11], acc[222:223], v[126:127], %[v_acc_11]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
" %[v_acc_12], acc[224:225], v[96:97], %[v_acc_12]
\n
"
_UK_MFMA_
" %[v_acc_12], acc[226:227], v[98:99], %[v_acc_12]
\n
"
" buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_12], acc[228:229], v[100:101], %[v_acc_12]
\n
"
_UK_MFMA_
" %[v_acc_12], acc[230:231], v[102:103], %[v_acc_12]
\n
"
_UK_MFMA_
" %[v_acc_12], acc[232:233], v[104:105], %[v_acc_12]
\n
"
_UK_MFMA_
" %[v_acc_12], acc[234:235], v[106:107], %[v_acc_12]
\n
"
" buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_12], acc[236:237], v[108:109], %[v_acc_12]
\n
"
_UK_MFMA_
" %[v_acc_12], acc[238:239], v[110:111], %[v_acc_12]
\n
"
_UK_MFMA_
" %[v_acc_13], acc[224:225], v[112:113], %[v_acc_13]
\n
"
_UK_MFMA_
" %[v_acc_13], acc[226:227], v[114:115], %[v_acc_13]
\n
"
" buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_13], acc[228:229], v[116:117], %[v_acc_13]
\n
"
_UK_MFMA_
" %[v_acc_13], acc[230:231], v[118:119], %[v_acc_13]
\n
"
_UK_MFMA_
" %[v_acc_13], acc[232:233], v[120:121], %[v_acc_13]
\n
"
_UK_MFMA_
" %[v_acc_13], acc[234:235], v[122:123], %[v_acc_13]
\n
"
" buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_13], acc[236:237], v[124:125], %[v_acc_13]
\n
"
_UK_MFMA_
" %[v_acc_13], acc[238:239], v[126:127], %[v_acc_13]
\n
"
_UK_MFMA_
" %[v_acc_14], acc[240:241], v[96:97], %[v_acc_14]
\n
"
_UK_MFMA_
" %[v_acc_14], acc[242:243], v[98:99], %[v_acc_14]
\n
"
" buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[20:23], 0 offen
\n
"
_UK_MFMA_
" %[v_acc_14], acc[244:245], v[100:101], %[v_acc_14]
\n
"
_UK_MFMA_
" %[v_acc_14], acc[246:247], v[102:103], %[v_acc_14]
\n
"
_UK_MFMA_
" %[v_acc_14], acc[248:249], v[104:105], %[v_acc_14]
\n
"
_UK_MFMA_
" %[v_acc_14], acc[250:251], v[106:107], %[v_acc_14]
\n
"
" buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[20:23], 0 offen offset:1024
\n
"
_UK_MFMA_
" %[v_acc_14], acc[252:253], v[108:109], %[v_acc_14]
\n
"
_UK_MFMA_
" %[v_acc_14], acc[254:255], v[110:111], %[v_acc_14]
\n
"
_UK_MFMA_
" %[v_acc_15], acc[240:241], v[112:113], %[v_acc_15]
\n
"
_UK_MFMA_
" %[v_acc_15], acc[242:243], v[114:115], %[v_acc_15]
\n
"
" buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[20:23], 0 offen offset:2048
\n
"
_UK_MFMA_
" %[v_acc_15], acc[244:245], v[116:117], %[v_acc_15]
\n
"
_UK_MFMA_
" %[v_acc_15], acc[246:247], v[118:119], %[v_acc_15]
\n
"
_UK_MFMA_
" %[v_acc_15], acc[248:249], v[120:121], %[v_acc_15]
\n
"
_UK_MFMA_
" %[v_acc_15], acc[250:251], v[122:123], %[v_acc_15]
\n
"
" buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[20:23], 0 offen offset:3072
\n
"
_UK_MFMA_
" %[v_acc_15], acc[252:253], v[124:125], %[v_acc_15]
\n
"
_UK_MFMA_
" %[v_acc_15], acc[254:255], v[126:127], %[v_acc_15]
\n
"
" s_sub_i32 %[s_loop_cnt], %[s_loop_cnt], 1
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 0
\n
"
" s_cbranch_scc0 L_end%=
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 2 ; move a with cond
\n
"
" s_cselect_b32 s86, %[s_tile_os_a], 0
\n
"
" s_add_u32 s16, s86, s16
\n
"
" s_addc_u32 s17, 0, s17
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond
\n
"
" s_cselect_b32 s86, %[s_tile_os_b], 0
\n
"
" s_add_u32 s20, s86, s20
\n
"
" s_addc_u32 s21, 0, s21
\n
"
" s_branch L_start%=
\n
"
"L_end%=:
\n
"
" s_nop 2
\n
"
#undef _UK_MFMA_
include/ck_tile/ops/fmha.hpp
View file @
2a768425
...
...
@@ -29,6 +29,8 @@
#include "ck_tile/ops/fmha/pipeline/block_fmha_fwd_appendkv_pipeline_default_policy.hpp"
#include "ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline.hpp"
#include "ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline_default_policy.hpp"
#include "ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs.hpp"
#include "ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs_default_policy.hpp"
#include "ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp"
#include "ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs_default_policy.hpp"
#include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_enum.hpp"
...
...
include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
View file @
2a768425
...
...
@@ -71,7 +71,8 @@ struct FmhaFwdKernel
using
bfs
=
typename
FmhaPipeline
::
BlockFmhaShape
;
using
g0br
=
typename
bfs
::
Gemm0BlockWarps
;
using
g1br
=
typename
bfs
::
Gemm1BlockWarps
;
using
gwt
=
typename
bfs
::
Gemm0WarpTile
;
using
g0wt
=
typename
bfs
::
Gemm0WarpTile
;
using
g1wt
=
typename
bfs
::
Gemm1WarpTile
;
#define _SS_ std::string
#define _TS_ std::to_string
auto
pn
=
[
&
]
()
{
...
...
@@ -88,7 +89,8 @@ struct FmhaFwdKernel
_TS_
(
bfs
::
kN1
)
+
"x"
+
_TS_
(
bfs
::
kK1
)
+
"x"
+
_TS_
(
bfs
::
kQKHeaddim
)
+
"_"
+
"r"
+
_TS_
(
g0br
::
at
(
ck_tile
::
number
<
0
>
{}))
+
"x"
+
_TS_
(
g0br
::
at
(
ck_tile
::
number
<
1
>
{}))
+
"x"
+
_TS_
(
g0br
::
at
(
ck_tile
::
number
<
2
>
{}))
+
"_"
+
"r"
+
_TS_
(
g1br
::
at
(
ck_tile
::
number
<
0
>
{}))
+
"x"
+
_TS_
(
g1br
::
at
(
ck_tile
::
number
<
1
>
{}))
+
"x"
+
_TS_
(
g1br
::
at
(
ck_tile
::
number
<
2
>
{}))
+
"_"
+
"w"
+
_TS_
(
gwt
::
at
(
ck_tile
::
number
<
0
>
{}))
+
"x"
+
_TS_
(
gwt
::
at
(
ck_tile
::
number
<
1
>
{}))
+
"x"
+
_TS_
(
gwt
::
at
(
ck_tile
::
number
<
2
>
{}))
+
"_"
+
"w"
+
_TS_
(
g0wt
::
at
(
ck_tile
::
number
<
0
>
{}))
+
"x"
+
_TS_
(
g0wt
::
at
(
ck_tile
::
number
<
1
>
{}))
+
"x"
+
_TS_
(
g0wt
::
at
(
ck_tile
::
number
<
2
>
{}))
+
"_"
+
"w"
+
_TS_
(
g1wt
::
at
(
ck_tile
::
number
<
0
>
{}))
+
"x"
+
_TS_
(
g1wt
::
at
(
ck_tile
::
number
<
1
>
{}))
+
"x"
+
_TS_
(
g1wt
::
at
(
ck_tile
::
number
<
2
>
{}))
+
"_"
+
(
kBlockPerCuInput
==
-
1
?
""
:
(
"o"
+
_TS_
(
kBlockPerCu
)
+
"_"
))
+
_SS_
(
FmhaPipeline
::
name
)
+
"_"
+
"v"
+
(
std
::
is_same_v
<
VLayout
,
ck_tile
::
tensor_layout
::
gemm
::
RowMajor
>
?
"r"
:
"c"
)
+
(
pn
.
empty
()
?
""
:
"_"
+
pn
)
+
(
BiasEnum
==
BlockAttentionBiasEnum
::
NO_BIAS
?
_SS_
(
""
)
:
(
_SS_
(
"_"
)
+
BlockAttentionBiasEnumToStr
<
BiasEnum
>::
name
))
+
...
...
include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp
View file @
2a768425
...
...
@@ -8,9 +8,11 @@ namespace ck_tile {
template
<
typename
TilePartitioner_
,
typename
FmhaPipeline_
,
typename
EpiloguePipeline_
>
struct
FmhaFwdSplitKVCombineKernel
{
using
TilePartitioner
=
remove_cvref_t
<
TilePartitioner_
>
;
using
FmhaPipeline
=
remove_cvref_t
<
FmhaPipeline_
>
;
using
EpiloguePipeline
=
remove_cvref_t
<
EpiloguePipeline_
>
;
using
TilePartitioner
=
remove_cvref_t
<
TilePartitioner_
>
;
using
FmhaPipeline
=
remove_cvref_t
<
FmhaPipeline_
>
;
using
EpiloguePipeline
=
remove_cvref_t
<
EpiloguePipeline_
>
;
static
constexpr
index_t
kNumWarps
=
FmhaPipeline
::
kNumWarps
;
static
constexpr
index_t
kBlockSize
=
FmhaPipeline
::
kBlockSize
;
static
constexpr
index_t
kBlockPerCu
=
FmhaPipeline
::
kBlockPerCu
;
static_assert
(
kBlockPerCu
>
0
);
...
...
@@ -50,8 +52,7 @@ struct FmhaFwdSplitKVCombineKernel
return
_SS_
(
"fmha_fwd_splitkv_combine_d"
)
+
_TS_
(
FmhaPipeline
::
kHeadDimV
)
+
"_"
+
_SS_
(
t2s
<
ODataType
>::
name
)
+
"_"
+
(
kIsGroupMode
?
"group"
:
"batch"
)
+
"_"
"b"
+
_TS_
(
FmhaPipeline
::
kM0
)
+
"x"
+
_TS_
(
FmhaPipeline
::
kN1
)
+
"_"
+
"b"
+
_TS_
(
FmhaPipeline
::
kN1
)
+
"_"
+
(
kBlockPerCuInput
==
-
1
?
""
:
(
"o"
+
_TS_
(
kBlockPerCu
)
+
"_"
))
+
_SS_
(
FmhaPipeline
::
name
)
+
(
pn
.
empty
()
?
""
:
"_"
+
pn
)
+
...
...
@@ -339,37 +340,56 @@ struct FmhaFwdSplitKVCombineKernel
number
<
FmhaPipeline
::
kAlignmentOacc
>
{},
number
<
1
>
{});
// read 4 * (kM0, kN1) o_acc tiles simultaneously by 4 warps
const
auto
o_acc_dram_view
=
pad_tensor_view
(
o_acc_dram_naive
,
make_tuple
(
number
<
1
>
{},
number
<
FmhaPipeline
::
kM0
>
{},
number
<
FmhaPipeline
::
kN1
>
{}),
sequence
<
false
,
kPadSeqLenQ
,
kPadHeadDimV
>
{});
make_tuple
(
number
<
kNumWarps
>
{},
number
<
FmhaPipeline
::
kM0
>
{},
number
<
FmhaPipeline
::
kN1
>
{}),
sequence
<
true
,
kPadSeqLenQ
,
kPadHeadDimV
>
{});
const
index_t
padded_num_splits
=
o_acc_dram_view
.
get_tensor_descriptor
().
get_lengths
()[
number
<
0
>
{}];
const
index_t
padded_seqlen_q
=
o_acc_dram_view
.
get_tensor_descriptor
().
get_lengths
()[
number
<
1
>
{}];
const
index_t
padded_hdim_v
=
o_acc_dram_view
.
get_tensor_descriptor
().
get_lengths
()[
number
<
2
>
{}];
return
transform_tensor_view
(
const
index_t
num_m_tiles
=
integer_divide_floor
(
padded_seqlen_q
,
FmhaPipeline
::
kM0
);
// transform tensor view by following steps, given shape: (padded_num_splits,
// padded_seqlen_q, padded_hdim_v)
// 1. unmerge to (padded_num_splits, num_m_tiles, kM0, padded_hdim_v)
// 2. transpose to (num_m_tiles, padded_num_splits, kM0, padded_hdim_v)
// 3. merge to (num_m_tiles * padded_num_splits * kM0, padded_hdim_v)
auto
transposed
=
transform_tensor_view
(
o_acc_dram_view
,
make_tuple
(
make_merge_transform
(
make_tuple
(
kargs
.
num_splits
,
padded_seqlen_q
)),
make_tuple
(
make_pass_through_transform
(
padded_num_splits
),
make_unmerge_transform
(
make_tuple
(
num_m_tiles
,
FmhaPipeline
::
kM0
)),
make_pass_through_transform
(
padded_hdim_v
)),
make_tuple
(
sequence
<
0
,
1
>
{},
sequence
<
2
>
{}),
make_tuple
(
sequence
<
0
>
{},
sequence
<
1
>
{},
sequence
<
2
>
{}),
make_tuple
(
sequence
<
1
>
{},
sequence
<
0
,
2
>
{},
sequence
<
3
>
{}));
return
transform_tensor_view
(
transposed
,
make_tuple
(
make_merge_transform
(
make_tuple
(
num_m_tiles
,
padded_num_splits
,
FmhaPipeline
::
kM0
)),
make_pass_through_transform
(
padded_hdim_v
)),
make_tuple
(
sequence
<
0
,
1
,
2
>
{},
sequence
<
3
>
{}),
make_tuple
(
sequence
<
0
>
{},
sequence
<
1
>
{}));
}();
auto
lse_acc_dram_window
=
make_tile_window
(
lse_acc_dram
,
[
&
]()
{
return
make_tuple
(
number
<
FmhaPipeline
::
kMaxSplits
>
{},
number
<
FmhaPipeline
::
kM0
>
{});
}(),
make_tuple
(
number
<
FmhaPipeline
::
kMaxSplits
>
{},
number
<
FmhaPipeline
::
kM0
>
{}),
{
0
,
i_m0
});
const
index_t
padded_num_splits
=
integer_divide_ceil
(
kargs
.
num_splits
,
kNumWarps
)
*
kNumWarps
;
auto
o_acc_dram_window
=
make_tile_window
(
o_acc_dram
,
[
&
]()
{
return
make_tuple
(
number
<
FmhaPipeline
::
kM0
>
{},
number
<
FmhaPipeline
::
kN1
>
{});
}(),
{
i_m0
,
i_n1
});
make_tuple
(
number
<
kNumWarps
*
FmhaPipeline
::
kM0
>
{},
number
<
FmhaPipeline
::
kN1
>
{}),
{
i_tile_m
*
padded_num_splits
*
FmhaPipeline
::
kM0
,
i_n1
});
// LSE DRAM window
auto
lse_dram_window
=
[
&
,
i_nhead_
=
i_nhead
]()
{
...
...
@@ -410,7 +430,6 @@ struct FmhaFwdSplitKVCombineKernel
identity
{},
// lse_element_func
composes
(
saturates
<
fp8_t
>
{},
scales
{
kargs
.
scale_o
}),
// o_acc_element_func
kargs
.
num_splits
,
kargs
.
seqlen_q
,
smem_ptr
);
}
else
...
...
@@ -419,7 +438,6 @@ struct FmhaFwdSplitKVCombineKernel
o_acc_dram_window
,
lse_dram_window
,
kargs
.
num_splits
,
kargs
.
seqlen_q
,
smem_ptr
);
}
}();
...
...
include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
View file @
2a768425
...
...
@@ -45,6 +45,7 @@ struct FmhaFwdSplitKVKernel
static
constexpr
bool
kPadHeadDimQ
=
FmhaPipeline
::
kPadHeadDimQ
;
static
constexpr
bool
kPadHeadDimV
=
FmhaPipeline
::
kPadHeadDimV
;
static
constexpr
auto
BiasEnum
=
FmhaPipeline
::
BiasEnum
;
static
constexpr
bool
kStoreLSE
=
FmhaPipeline
::
kStoreLSE
;
static
constexpr
bool
kDoFp8StaticQuant
=
FmhaPipeline
::
Problem
::
kDoFp8StaticQuant
;
static
constexpr
bool
kIsPagedKV
=
FmhaPipeline
::
Problem
::
kIsPagedKV
;
...
...
@@ -67,7 +68,8 @@ struct FmhaFwdSplitKVKernel
using
bfs
=
typename
FmhaPipeline
::
BlockFmhaShape
;
using
g0br
=
typename
bfs
::
Gemm0BlockWarps
;
using
g1br
=
typename
bfs
::
Gemm1BlockWarps
;
using
gwt
=
typename
bfs
::
Gemm0WarpTile
;
using
g0wt
=
typename
bfs
::
Gemm0WarpTile
;
using
g1wt
=
typename
bfs
::
Gemm1WarpTile
;
#define _SS_ std::string
#define _TS_ std::to_string
auto
pn
=
[
&
]
()
{
...
...
@@ -84,11 +86,12 @@ struct FmhaFwdSplitKVKernel
_TS_
(
bfs
::
kN1
)
+
"x"
+
_TS_
(
bfs
::
kK1
)
+
"x"
+
_TS_
(
bfs
::
kQKHeaddim
)
+
"_"
+
"r"
+
_TS_
(
g0br
::
at
(
ck_tile
::
number
<
0
>
{}))
+
"x"
+
_TS_
(
g0br
::
at
(
ck_tile
::
number
<
1
>
{}))
+
"x"
+
_TS_
(
g0br
::
at
(
ck_tile
::
number
<
2
>
{}))
+
"_"
+
"r"
+
_TS_
(
g1br
::
at
(
ck_tile
::
number
<
0
>
{}))
+
"x"
+
_TS_
(
g1br
::
at
(
ck_tile
::
number
<
1
>
{}))
+
"x"
+
_TS_
(
g1br
::
at
(
ck_tile
::
number
<
2
>
{}))
+
"_"
+
"w"
+
_TS_
(
gwt
::
at
(
ck_tile
::
number
<
0
>
{}))
+
"x"
+
_TS_
(
gwt
::
at
(
ck_tile
::
number
<
1
>
{}))
+
"x"
+
_TS_
(
gwt
::
at
(
ck_tile
::
number
<
2
>
{}))
+
"_"
+
"w"
+
_TS_
(
g0wt
::
at
(
ck_tile
::
number
<
0
>
{}))
+
"x"
+
_TS_
(
g0wt
::
at
(
ck_tile
::
number
<
1
>
{}))
+
"x"
+
_TS_
(
g0wt
::
at
(
ck_tile
::
number
<
2
>
{}))
+
"_"
+
"w"
+
_TS_
(
g1wt
::
at
(
ck_tile
::
number
<
0
>
{}))
+
"x"
+
_TS_
(
g1wt
::
at
(
ck_tile
::
number
<
1
>
{}))
+
"x"
+
_TS_
(
g1wt
::
at
(
ck_tile
::
number
<
2
>
{}))
+
"_"
+
(
kBlockPerCuInput
==
-
1
?
""
:
(
"o"
+
_TS_
(
kBlockPerCu
)
+
"_"
))
+
_SS_
(
FmhaPipeline
::
name
)
+
"_"
+
"v"
+
(
std
::
is_same_v
<
VLayout
,
ck_tile
::
tensor_layout
::
gemm
::
RowMajor
>
?
"r"
:
"c"
)
+
(
pn
.
empty
()
?
""
:
"_"
+
pn
)
+
(
BiasEnum
==
BlockAttentionBiasEnum
::
NO_BIAS
?
_SS_
(
""
)
:
(
_SS_
(
"_"
)
+
BlockAttentionBiasEnumToStr
<
BiasEnum
>::
name
))
+
(
kHasMask
?
"_"
+
_SS_
(
FmhaMask
::
name
)
:
""
)
+
(
kDoFp8StaticQuant
?
"_squant"
:
""
)
+
(
kIsPagedKV
?
"_pagedkv"
:
""
);
(
kHasMask
?
"_"
+
_SS_
(
FmhaMask
::
name
)
:
""
)
+
(
kStoreLSE
?
"_lse"
:
""
)
+
(
kDoFp8StaticQuant
?
"_squant"
:
""
)
+
(
kIsPagedKV
?
"_pagedkv"
:
""
);
#undef _SS_
#undef _TS_
// clang-format on
...
...
include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline.hpp
View file @
2a768425
...
...
@@ -53,6 +53,7 @@ struct BlockFmhaFwdSplitKVCombinePipeline
using
OaccDataType
=
remove_cvref_t
<
typename
Problem
::
OaccDataType
>
;
using
ODataType
=
remove_cvref_t
<
typename
Problem
::
ODataType
>
;
static
constexpr
index_t
kNumWarps
=
Problem
::
kNumWarps
;
static
constexpr
index_t
kBlockSize
=
Problem
::
kBlockSize
;
static
constexpr
index_t
kHeadDimV
=
Problem
::
kHeadDimV
;
...
...
@@ -117,7 +118,6 @@ struct BlockFmhaFwdSplitKVCombinePipeline
const
LSEElementFunction
&
lse_element_func
,
const
OaccElementFunction
&
o_acc_element_func
,
index_t
num_splits
,
index_t
seqlen_q
,
void
*
smem_ptr
)
const
{
// lse_acc tile in LDS
...
...
@@ -143,11 +143,12 @@ struct BlockFmhaFwdSplitKVCombinePipeline
// copy lse_acc tile (shape=[kMaxSplits, kM0]) to LDS (shape=[kMaxSplits, kM0]).
auto
lse_acc_tile
=
load_tile
(
lse_acc_dram_window
);
store_tile
(
lse_acc_lds_write_window
,
lse_acc_tile
);
block_sync_lds
();
auto
lse_accum
=
make_static_distributed_tensor
<
LSEDataType
>
(
Policy
::
template
MakeLSEaccRegTileDistribution
<
Problem
>());
__builtin_amdgcn_sched_barrier
(
0
);
block_sync_lds
();
// copy LDS (shape=[kM0, kMaxSplits]) to lse_accum (shape=[kM0, kMaxSplits])
// and fill up -INF values outside the [kM0, num_splits] region.
{
...
...
@@ -264,46 +265,94 @@ struct BlockFmhaFwdSplitKVCombinePipeline
}
});
}
block_sync_lds
();
if
constexpr
(
kStoreLSE
)
{
store_tile
(
lse_dram_window_tmp
,
tile_elementwise_in
(
lse_element_func
,
lse_logsum
));
}
auto
o_acc_dist
=
Policy
::
template
MakeOaccDramTileDistribution
<
Problem
>();
auto
o_acc_dram_window
=
auto
o_acc_
4_
dist
=
Policy
::
template
MakeOacc
4
DramTileDistribution
<
Problem
>();
auto
o_acc_
4_
dram_window
=
make_tile_window
(
o_acc_dram_block_window_tmp
.
get_bottom_tensor_view
(),
o_acc_dram_block_window_tmp
.
get_window_lengths
(),
o_acc_dram_block_window_tmp
.
get_window_origin
(),
o_acc_dist
);
auto
o_acc
=
make_static_distributed_tensor
<
OaccDataType
>
(
o_acc_dist
);
clear_tile
(
o_acc
);
o_acc_4_dist
);
const
index_t
padded_seqlen_q
=
integer_divide_ceil
(
seqlen_q
,
kM0
)
*
kM0
;
// shape=[4 * KM0, kN1]
auto
o_acc_4
=
make_static_distributed_tensor
<
OaccDataType
>
(
o_acc_4_dist
);
clear_tile
(
o_acc_4
);
for
(
index_t
i_split
=
0
;
i_split
<
num_splits
;
++
i_split
)
const
index_t
padded_num_splits
=
integer_divide_ceil
(
num_splits
,
kNumWarps
)
*
kNumWarps
;
__builtin_amdgcn_sched_barrier
(
0
);
block_sync_lds
();
// each warp handles a [KM0, kN1] tile
for
(
index_t
split_start
=
0
;
split_start
<
padded_num_splits
;
split_start
+=
kNumWarps
)
{
auto
o_tile
=
load_tile
(
o_acc_dram_window
);
auto
o_tile
=
load_tile
(
o_acc_4_dram_window
);
const
index_t
i_split
=
split_start
+
get_warp_id
();
const
index_t
row_start
=
kM0
*
get_warp_id
();
{
constexpr
auto
spans
=
decltype
(
o_acc
)
::
get_distributed_spans
();
constexpr
auto
spans
=
decltype
(
o_acc
_4
)
::
get_distributed_spans
();
sweep_tile_span
(
spans
[
number
<
0
>
{}],
[
&
](
auto
idx0
)
{
sweep_tile_span
(
spans
[
number
<
1
>
{}],
[
&
](
auto
idx1
)
{
constexpr
auto
i_j_idx
=
make_tuple
(
idx0
,
idx1
);
const
auto
x_indices
=
get_x_indices_from_distributed_indices
(
o_acc
.
get_tile_distribution
(),
i_j_idx
);
o_acc
_4
.
get_tile_distribution
(),
i_j_idx
);
const
auto
row
=
x_indices
.
at
(
number
<
0
>
{});
const
LSEDataType
lse_scale
=
lse_acc_lds
(
row
,
i_split
);
o_acc
(
i_j_idx
)
+=
lse_scale
*
o_tile
(
i_j_idx
);
const
LSEDataType
lse_scale
=
lse_acc_lds
(
row
-
row_start
,
i_split
);
o_acc
_4
(
i_j_idx
)
+=
lse_scale
*
o_tile
(
i_j_idx
);
});
});
}
move_tile_window
(
o_acc_dram_window
,
{
padded_seqlen_q
,
0
});
move_tile_window
(
o_acc_4_dram_window
,
{
kNumWarps
*
kM0
,
0
});
}
// 4 o_acc tiles in LDS. shape=[4 * kM0, kN1]
OaccDataType
*
o_acc_4_lds_ptr
=
static_cast
<
OaccDataType
*>
(
static_cast
<
void
*>
(
static_cast
<
char
*>
(
smem_ptr
)
+
Policy
::
template
GetSmemSizeLSEacc
<
Problem
>()));
{
auto
o_acc_4_lds_window
=
[
&
]()
{
auto
desc
=
Policy
::
template
MakeOacc4LdsBlockDescriptor
<
Problem
>();
auto
view
=
make_tensor_view
<
address_space_enum
::
lds
>
(
o_acc_4_lds_ptr
,
desc
);
return
make_tile_window
(
view
,
desc
.
get_lengths
(),
{
0
,
0
});
}();
store_tile
(
o_acc_4_lds_window
,
o_acc_4
);
}
auto
o_acc_dist
=
Policy
::
template
MakeOaccDramTileDistribution
<
Problem
>();
auto
o_acc_4_lds_window
=
[
&
]()
{
auto
desc
=
Policy
::
template
MakeOacc4LdsBlockDescriptor
<
Problem
>();
auto
view
=
make_tensor_view
<
address_space_enum
::
lds
>
(
o_acc_4_lds_ptr
,
desc
);
return
make_tile_window
(
view
,
desc
.
get_lengths
(),
{
0
,
0
},
o_acc_dist
);
}();
auto
o_acc
=
make_static_distributed_tensor
<
OaccDataType
>
(
o_acc_dist
);
clear_tile
(
o_acc
);
__builtin_amdgcn_sched_barrier
(
0
);
block_sync_lds
();
static_for
<
0
,
kNumWarps
,
1
>
{}([
&
](
auto
)
{
auto
o_acc_in
=
load_tile
(
o_acc_4_lds_window
);
{
constexpr
auto
spans
=
decltype
(
o_acc
)
::
get_distributed_spans
();
sweep_tile_span
(
spans
[
number
<
0
>
{}],
[
&
](
auto
idx0
)
{
sweep_tile_span
(
spans
[
number
<
1
>
{}],
[
&
](
auto
idx1
)
{
constexpr
auto
i_j_idx
=
make_tuple
(
idx0
,
idx1
);
o_acc
(
i_j_idx
)
+=
o_acc_in
(
i_j_idx
);
});
});
}
move_tile_window
(
o_acc_4_lds_window
,
{
kM0
,
0
});
});
o_acc
=
tile_elementwise_in
(
o_acc_element_func
,
o_acc
);
return
o_acc
;
...
...
@@ -316,7 +365,6 @@ struct BlockFmhaFwdSplitKVCombinePipeline
const
OaccDramBlockWindow
&
o_acc_dram_block_window
,
LSEDramBlockWindow
&
lse_dram_block_window
,
index_t
num_splits
,
index_t
seqlen_q
,
void
*
smem_ptr
)
const
{
return
operator
()(
lse_acc_dram_block_window
,
...
...
@@ -325,7 +373,6 @@ struct BlockFmhaFwdSplitKVCombinePipeline
identity
{},
identity
{},
num_splits
,
seqlen_q
,
smem_ptr
);
}
};
...
...
include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline_default_policy.hpp
View file @
2a768425
...
...
@@ -10,23 +10,38 @@ namespace ck_tile {
struct
BlockFmhaFwdSplitKVCombinePipelineDefaultPolicy
{
template
<
index_t
BlockSize
,
index_t
M
,
index_t
N
,
typename
DataType
>
template
<
index_t
NumWarps
,
index_t
M
,
index_t
N
,
typename
DataType
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
GetMaxNumWarpsForTile
()
{
static_assert
(
NumWarps
==
1
||
NumWarps
==
2
||
NumWarps
==
4
);
constexpr
index_t
ElemPerThread
=
(
M
*
N
)
/
(
NumWarps
*
get_warp_size
());
if
constexpr
(
0
<
ElemPerThread
)
{
return
NumWarps
;
}
else
{
// try dividing tile by smaller # of warps
return
GetMaxNumWarpsForTile
<
NumWarps
/
2
,
M
,
N
,
DataType
>
();
}
}
template
<
index_t
NumWarps
,
index_t
M
,
index_t
N
,
typename
DataType
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
GetVectorSizeForTile
()
{
constexpr
index_t
PixelsPerThread
=
(
M
*
N
)
/
BlockSize
;
static_assert
(
0
<
PixelsPerThread
);
constexpr
index_t
MaxNumWarps
=
GetMaxNumWarpsForTile
<
NumWarps
,
M
,
N
,
DataType
>
();
constexpr
index_t
MaxNPerThread
=
16
/
sizeof
(
DataType
);
constexpr
index_t
NPerThread
=
min
(
MaxNPerThread
,
PixelsPerThread
);
constexpr
index_t
ElemPerThread
=
(
M
*
N
)
/
(
MaxNumWarps
*
get_warp_size
());
return
NPerThread
;
constexpr
index_t
MaxNPerThread
=
16
/
sizeof
(
DataType
);
return
min
(
MaxNPerThread
,
ElemPerThread
);
}
// alignment for dram lse tile (shape=[kMaxSplits, kM0])
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
GetAlignmentLSE
()
{
return
GetVectorSizeForTile
<
Problem
::
k
BlockSize
,
return
GetVectorSizeForTile
<
Problem
::
k
NumWarps
,
Problem
::
kMaxSplits
,
Problem
::
kM0
,
typename
Problem
::
LSEDataType
>
();
...
...
@@ -56,40 +71,54 @@ struct BlockFmhaFwdSplitKVCombinePipelineDefaultPolicy
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
ck_tile
::
index_t
GetSmemSize
()
CK_TILE_HOST_DEVICE
static
constexpr
ck_tile
::
index_t
GetSmemSize
LSEacc
()
{
return
sizeof
(
typename
Problem
::
LSEDataType
)
*
MakeLSEaccLdsBlockDescriptor
<
Problem
>
().
get_element_space_size
();
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
ck_tile
::
index_t
GetSmemSizeOacc4
()
{
return
sizeof
(
typename
Problem
::
OaccDataType
)
*
MakeOacc4LdsBlockDescriptor
<
Problem
>
().
get_element_space_size
();
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
ck_tile
::
index_t
GetSmemSize
()
{
return
GetSmemSizeLSEacc
<
Problem
>
()
+
GetSmemSizeOacc4
<
Problem
>
();
}
// shape=[kMaxSplits, kM0]
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
MakeLSEaccDramTileDistribution
()
{
using
LSEDataType
=
remove_cvref_t
<
typename
Problem
::
LSEDataType
>
;
constexpr
index_t
kBlockSize
=
Problem
::
kBlockSize
;
constexpr
index_t
kNumWarps
=
Problem
::
kNumWarps
;
constexpr
index_t
kNPerBlock
=
Problem
::
kM0
;
constexpr
index_t
kMPerBlock
=
Problem
::
kMaxSplits
;
constexpr
index_t
kNPerBlock
=
Problem
::
kM0
;
constexpr
index_t
MaxNumWarps
=
GetMaxNumWarpsForTile
<
Problem
::
kNumWarps
,
kNPerBlock
,
kMPerBlock
,
LSEDataType
>
();
constexpr
index_t
Replicate
=
Problem
::
kNumWarps
/
MaxNumWarps
;
constexpr
index_t
NPerThread
=
GetVectorSizeForTile
<
kBlockSize
,
kMPerBlock
,
kNPerBlock
,
LSEDataType
>
();
GetVectorSizeForTile
<
MaxNumWarps
,
kMPerBlock
,
kNPerBlock
,
LSEDataType
>
();
constexpr
index_t
NThreads
=
kNPerBlock
/
NPerThread
;
constexpr
index_t
MThreadsPerWarp
=
get_warp_size
()
/
NThreads
;
constexpr
index_t
MPerThread
=
kMPerBlock
/
(
k
NumWarps
*
MThreadsPerWarp
);
constexpr
index_t
MPerThread
=
kMPerBlock
/
(
Max
NumWarps
*
MThreadsPerWarp
);
static_assert
(
MPerThread
*
MaxNumWarps
*
MThreadsPerWarp
==
kMPerBlock
);
static_assert
(
NThreads
*
NPerThread
==
kNPerBlock
);
static_assert
(
MPerThread
*
kNumWarps
*
MThreadsPerWarp
==
kMPerBlock
);
return
make_static_tile_distribution
(
tile_distribution_encoding
<
sequence
<
1
>
,
tuple
<
sequence
<
MPerThread
,
k
NumWarps
,
MThreadsPerWarp
>
,
tile_distribution_encoding
<
sequence
<
Replicate
>
,
tuple
<
sequence
<
MPerThread
,
Max
NumWarps
,
MThreadsPerWarp
>
,
sequence
<
NThreads
,
NPerThread
>>
,
tuple
<
sequence
<
1
>
,
sequence
<
1
,
2
>>
,
tuple
<
sequence
<
1
>
,
sequence
<
2
,
0
>>
,
tuple
<
sequence
<
0
,
1
>
,
sequence
<
1
,
2
>>
,
tuple
<
sequence
<
0
,
1
>
,
sequence
<
2
,
0
>>
,
sequence
<
1
,
2
>
,
sequence
<
0
,
1
>>
{});
}
...
...
@@ -100,17 +129,15 @@ struct BlockFmhaFwdSplitKVCombinePipelineDefaultPolicy
{
using
LSEDataType
=
remove_cvref_t
<
typename
Problem
::
LSEDataType
>
;
constexpr
index_t
kBlockSize
=
Problem
::
kBlockSize
;
constexpr
index_t
kMPerBlock
=
Problem
::
kMaxSplits
;
constexpr
index_t
kNPerBlock
=
Problem
::
kM0
;
constexpr
index_t
kMPerBlock
=
Problem
::
kM0
;
constexpr
index_t
kNPerBlock
=
Problem
::
kMaxSplits
;
constexpr
index_t
NPack
=
GetVectorSizeForTile
<
kBlockSize
,
kMPerBlock
,
kNPerBlock
,
LSEDataType
>
();
GetVectorSizeForTile
<
Problem
::
kNumWarps
,
kMPerBlock
,
kNPerBlock
,
LSEDataType
>
();
constexpr
auto
lse_acc_lds_block_desc_0
=
make_naive_tensor_descriptor
(
make_tuple
(
number
<
kNPerBlock
/
NPack
>
{},
number
<
kMPerBlock
>
{},
number
<
NPack
>
{}),
make_tuple
(
number
<
(
kMPerBlock
+
1
)
*
NPack
>
{},
number
<
NPack
>
{},
number
<
1
>
{}),
number
<
8
>
{},
number
<
NPack
>
{},
number
<
1
>
{});
constexpr
auto
lse_acc_lds_block_desc
=
transform_tensor_descriptor
(
...
...
@@ -129,17 +156,15 @@ struct BlockFmhaFwdSplitKVCombinePipelineDefaultPolicy
{
using
LSEDataType
=
remove_cvref_t
<
typename
Problem
::
LSEDataType
>
;
constexpr
index_t
kBlockSize
=
Problem
::
kBlockSize
;
constexpr
index_t
kMPerBlock
=
Problem
::
kMaxSplits
;
constexpr
index_t
kNPerBlock
=
Problem
::
kM0
;
constexpr
index_t
kMPerBlock
=
Problem
::
kM0
;
constexpr
index_t
kNPerBlock
=
Problem
::
kMaxSplits
;
constexpr
index_t
NPack
=
GetVectorSizeForTile
<
kBlockSize
,
kMPerBlock
,
kNPerBlock
,
LSEDataType
>
();
GetVectorSizeForTile
<
Problem
::
kNumWarps
,
kMPerBlock
,
kNPerBlock
,
LSEDataType
>
();
constexpr
auto
lse_acc_lds_block_desc_0
=
make_naive_tensor_descriptor
(
make_tuple
(
number
<
kNPerBlock
/
NPack
>
{},
number
<
kMPerBlock
>
{},
number
<
NPack
>
{}),
make_tuple
(
number
<
(
kMPerBlock
+
1
)
*
NPack
>
{},
number
<
NPack
>
{},
number
<
1
>
{}),
number
<
8
>
{},
number
<
NPack
>
{},
number
<
1
>
{});
constexpr
auto
lse_acc_t_lds_block_desc
=
transform_tensor_descriptor
(
...
...
@@ -152,33 +177,86 @@ struct BlockFmhaFwdSplitKVCombinePipelineDefaultPolicy
return
lse_acc_t_lds_block_desc
;
}
// 3d + padding, shape=[4 * kM0, kN1]
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
Make
LSEaccRegTileDistribution
()
CK_TILE_HOST_DEVICE
static
constexpr
auto
Make
Oacc4LdsBlockDescriptor
()
{
constexpr
index_t
kBlockSize
=
Problem
::
kBlockSize
;
using
LSEDataType
=
remove_cvref_t
<
typename
Problem
::
LSEDataType
>
;
constexpr
index_t
kNPerBlock
=
Problem
::
kMaxSplits
;
constexpr
index_t
kMPerBlock
=
4
*
Problem
::
kM0
;
constexpr
index_t
kNPerBlock
=
Problem
::
kN1
;
constexpr
index_t
NPack
=
GetVectorSizeForTile
<
Problem
::
kNumWarps
,
kMPerBlock
,
kNPerBlock
,
LSEDataType
>
();
constexpr
auto
o_acc_lds_block_desc_0
=
make_naive_tensor_descriptor
(
make_tuple
(
number
<
kNPerBlock
/
NPack
>
{},
number
<
kMPerBlock
>
{},
number
<
NPack
>
{}),
make_tuple
(
number
<
(
kMPerBlock
+
1
)
*
NPack
>
{},
number
<
NPack
>
{},
number
<
1
>
{}),
number
<
8
>
{},
number
<
1
>
{});
constexpr
auto
o_acc_t_lds_block_desc
=
transform_tensor_descriptor
(
o_acc_lds_block_desc_0
,
make_tuple
(
make_pass_through_transform
(
kMPerBlock
),
make_merge_transform
(
make_tuple
(
kNPerBlock
/
NPack
,
NPack
))),
make_tuple
(
sequence
<
1
>
{},
sequence
<
0
,
2
>
{}),
make_tuple
(
sequence
<
1
>
{},
sequence
<
0
>
{}));
return
o_acc_t_lds_block_desc
;
}
// shape=[kM0, kMaxSplits]
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
MakeLSEaccRegTileDistribution
()
{
constexpr
index_t
kMPerBlock
=
Problem
::
kM0
;
constexpr
index_t
kNPerBlock
=
Problem
::
kMaxSplits
;
constexpr
index_t
NThreads
=
4
;
constexpr
index_t
NPerThread
=
kNPerBlock
/
NThreads
;
constexpr
index_t
MaxNThreads
=
8
;
constexpr
index_t
NThreads
=
min
(
kNPerBlock
,
MaxNThreads
);
constexpr
index_t
NPerThread
=
kNPerBlock
/
NThreads
;
constexpr
index_t
MThreads
=
kBlockSize
/
NThreads
;
constexpr
index_t
MPerThread
=
kMPerBlock
/
MThreads
;
constexpr
index_t
MWarps
=
kBlockSize
/
get_warp_size
();
constexpr
index_t
MPerThread
=
1
;
constexpr
index_t
MThreads
=
kMPerBlock
/
MPerThread
;
constexpr
index_t
MThreadPerWarp
=
get_warp_size
()
/
NThreads
;
constexpr
index_t
MaxNumWarps
=
(
MThreads
*
NThreads
)
/
get_warp_size
();
constexpr
index_t
Replicate
=
Problem
::
kNumWarps
/
MaxNumWarps
;
static_assert
(
MaxNumWarps
*
MThreadPerWarp
*
MPerThread
==
kMPerBlock
);
static_assert
(
NThreads
*
NPerThread
==
kNPerBlock
);
static_assert
(
MWarps
*
MThreadPerWarp
*
MPerThread
==
kMPerBlock
);
return
make_static_tile_distribution
(
tile_distribution_encoding
<
sequence
<
1
>
,
tuple
<
sequence
<
MWarps
,
MThreadPerWarp
,
MPerThread
>
,
sequence
<
NThreads
,
NPerThread
>>
,
tuple
<
sequence
<
1
>
,
sequence
<
2
,
1
>>
,
tuple
<
sequence
<
0
>
,
sequence
<
0
,
1
>>
,
sequence
<
1
,
2
>
,
sequence
<
2
,
1
>>
{});
tile_distribution_encoding
<
sequence
<
Replicate
>
,
tuple
<
sequence
<
MaxNumWarps
,
MThreadPerWarp
,
MPerThread
>
,
sequence
<
NThreads
,
NPerThread
>>
,
tuple
<
sequence
<
0
,
1
>
,
sequence
<
2
,
1
>>
,
tuple
<
sequence
<
0
,
0
>
,
sequence
<
0
,
1
>>
,
sequence
<
1
,
2
>
,
sequence
<
2
,
1
>>
{});
}
// similar to MakeOaccDramTileDistribution(), but duplicate same 1-warp encoding 4 times on M
// direction
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
MakeOacc4DramTileDistribution
()
{
constexpr
index_t
kMPerBlock
=
Problem
::
kM0
;
// real kMPerBlock we want is (4 * kM0)
constexpr
index_t
kNPerBlock
=
Problem
::
kN1
;
static_assert
(
get_warp_size
()
<=
kMPerBlock
*
kNPerBlock
);
constexpr
index_t
M1
=
1
;
// compose encoding base on 1 warp
constexpr
index_t
M2
=
min
(
kMPerBlock
/
M1
,
get_warp_size
());
constexpr
index_t
N0
=
get_warp_size
()
/
M2
;
constexpr
index_t
N1
=
kNPerBlock
/
N0
;
constexpr
index_t
M0
=
kMPerBlock
/
(
M2
*
M1
);
return
make_static_tile_distribution
(
tile_distribution_encoding
<
sequence
<
1
>
,
tuple
<
sequence
<
4
,
M0
,
M1
,
M2
>
,
sequence
<
N0
,
N1
>>
,
tuple
<
sequence
<
1
,
1
>
,
sequence
<
1
,
2
>>
,
tuple
<
sequence
<
0
,
2
>
,
sequence
<
3
,
0
>>
,
sequence
<
1
,
2
>
,
sequence
<
1
,
1
>>
{});
}
template
<
typename
Problem
>
...
...
@@ -187,6 +265,7 @@ struct BlockFmhaFwdSplitKVCombinePipelineDefaultPolicy
constexpr
index_t
kBlockSize
=
Problem
::
kBlockSize
;
constexpr
index_t
kMPerBlock
=
Problem
::
kM0
;
constexpr
index_t
kNPerBlock
=
Problem
::
kN1
;
static_assert
(
kBlockSize
<=
kMPerBlock
*
kNPerBlock
);
constexpr
index_t
M1
=
kBlockSize
/
get_warp_size
();
constexpr
index_t
M2
=
min
(
kMPerBlock
/
M1
,
get_warp_size
());
...
...
include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs.hpp
0 → 100644
View file @
2a768425
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/fmha/block/block_attention_bias_enum.hpp"
#include "ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs_default_policy.hpp"
#include "ck_tile/ops/reduce/block/block_reduce.hpp"
namespace
ck_tile
{
// This pipeline is qkv all located in LDS
template
<
typename
Problem_
,
typename
Policy_
=
BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVSDefaultPolicy
>
struct
BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVS
{
using
Problem
=
remove_cvref_t
<
Problem_
>
;
using
Policy
=
remove_cvref_t
<
Policy_
>
;
using
QDataType
=
remove_cvref_t
<
typename
Problem
::
QDataType
>
;
using
KDataType
=
remove_cvref_t
<
typename
Problem
::
KDataType
>
;
using
VDataType
=
remove_cvref_t
<
typename
Problem
::
VDataType
>
;
using
SaccDataType
=
remove_cvref_t
<
typename
Problem
::
SaccDataType
>
;
using
SMPLComputeDataType
=
remove_cvref_t
<
typename
Problem
::
SMPLComputeDataType
>
;
using
BiasDataType
=
remove_cvref_t
<
typename
Problem
::
BiasDataType
>
;
using
LSEDataType
=
remove_cvref_t
<
typename
Problem
::
LSEDataType
>
;
using
PDataType
=
remove_cvref_t
<
typename
Problem
::
PDataType
>
;
using
OaccDataType
=
remove_cvref_t
<
typename
Problem
::
OaccDataType
>
;
using
ODataType
=
remove_cvref_t
<
typename
Problem
::
ODataType
>
;
using
FmhaMask
=
remove_cvref_t
<
typename
Problem
::
FmhaMask
>
;
using
BlockFmhaShape
=
remove_cvref_t
<
typename
Problem
::
BlockFmhaShape
>
;
using
VLayout
=
remove_cvref_t
<
typename
BlockFmhaShape
::
VLayout
>
;
static
constexpr
bool
kQLoadOnce
=
true
;
// if q_tile load whole block length (hdim) at once
static_assert
(
kQLoadOnce
==
Policy
::
QLoadOnce
);
static
constexpr
index_t
kBlockSize
=
Problem
::
kBlockSize
;
static
constexpr
index_t
kM0
=
BlockFmhaShape
::
kM0
;
static
constexpr
index_t
kN0
=
BlockFmhaShape
::
kN0
;
static
constexpr
index_t
kK0
=
BlockFmhaShape
::
kK0
;
static
constexpr
index_t
kN1
=
BlockFmhaShape
::
kN1
;
static
constexpr
index_t
kK1
=
BlockFmhaShape
::
kK1
;
static
constexpr
index_t
kQKHeaddim
=
BlockFmhaShape
::
kQKHeaddim
;
static
constexpr
index_t
kSubQKHeaddim
=
BlockFmhaShape
::
kSubQKHeaddim
;
static
constexpr
bool
kIsGroupMode
=
Problem
::
kIsGroupMode
;
static
constexpr
bool
kPadSeqLenQ
=
Problem
::
kPadSeqLenQ
;
static
constexpr
bool
kPadSeqLenK
=
Problem
::
kPadSeqLenK
;
static
constexpr
bool
kPadHeadDimQ
=
Problem
::
kPadHeadDimQ
;
static
constexpr
bool
kPadHeadDimV
=
Problem
::
kPadHeadDimV
;
static
constexpr
auto
BiasEnum
=
Problem
::
BiasEnum
;
static
constexpr
bool
kStoreLSE
=
Problem
::
kStoreLSE
;
static
constexpr
bool
kIsPagedKV
=
Problem
::
kIsPagedKV
;
static
constexpr
bool
kHasUnevenSplits
=
Problem
::
kHasUnevenSplits
;
// last dimension vector length used to create tensor view(and decide buffer_load vector length)
// ... together with tensor distribution. tensor dist should able to overwrite this
static
constexpr
index_t
kAlignmentQ
=
kPadHeadDimQ
?
1
:
Policy
::
template
GetAlignmentQ
<
Problem
>();
static
constexpr
index_t
kAlignmentK
=
kPadHeadDimQ
?
1
:
Policy
::
template
GetAlignmentK
<
Problem
>();
static
constexpr
index_t
kAlignmentV
=
[]()
{
if
constexpr
(
std
::
is_same_v
<
VLayout
,
ck_tile
::
tensor_layout
::
gemm
::
RowMajor
>
)
return
kPadHeadDimV
?
1
:
Policy
::
template
GetAlignmentV
<
Problem
>();
else
return
kPadSeqLenK
?
1
:
Policy
::
template
GetAlignmentV
<
Problem
>();
}();
static
constexpr
index_t
kAlignmentOacc
=
kPadHeadDimV
?
1
:
Policy
::
template
GetAlignmentOacc
<
Problem
>();
static
constexpr
index_t
kAlignmentBias
=
kPadSeqLenK
?
1
:
Policy
::
template
GetAlignmentBias
<
Problem
>();
static
constexpr
index_t
kBlockPerCu
=
[]()
{
if
constexpr
(
Problem
::
kBlockPerCu
!=
-
1
)
return
Problem
::
kBlockPerCu
;
else
{
if
constexpr
(
kQKHeaddim
<=
32
)
{
return
2
;
}
else
if
constexpr
(
kQKHeaddim
<=
64
)
{
return
3
;
}
else
if
constexpr
(
kQKHeaddim
<=
128
)
{
if
constexpr
(
BiasEnum
==
BlockAttentionBiasEnum
::
ELEMENTWISE_BIAS
)
return
1
;
else
return
2
;
}
else
if
constexpr
(
kQKHeaddim
<=
256
)
{
return
1
;
}
}
}();
static
constexpr
const
char
*
name
=
"qr_nwarp_sshuffle"
;
CK_TILE_HOST_DEVICE
static
constexpr
ck_tile
::
index_t
GetSmemSize
()
{
return
Policy
::
template
GetSmemSize
<
Problem
>();
}
template
<
typename
QDramBlockWindowTmp
,
typename
KDramBlockWindowLengths
,
typename
KPageBlockNavigator
,
typename
VDramBlockWindowLengths
,
typename
VPageBlockNavigator
,
typename
BiasDramBlockWindowTmp
,
typename
LSEaccDramBlockWindowTmp
,
typename
QElementFunction
,
typename
KElementFunction
,
typename
VElementFunction
,
typename
BiasElementFunction
,
typename
LSEaccElementFunction
,
typename
SAccElementFunction
,
typename
PComputeElementFunction
,
typename
OAccElementFunction
,
typename
PositionEncoding
>
CK_TILE_HOST_DEVICE
auto
operator
()(
const
QDramBlockWindowTmp
&
q_dram_block_window_tmp
,
// M0*K0 tile
const
QElementFunction
&
q_element_func
,
const
KDramBlockWindowLengths
&
k_dram_block_window_lengths
,
// N0*K0 tile
const
KPageBlockNavigator
&
k_page_block_navigator
,
const
KElementFunction
&
k_element_func
,
const
VDramBlockWindowLengths
&
v_dram_block_window_lengths
,
// N1*K1 tile
const
VPageBlockNavigator
&
v_page_block_navigator
,
const
VElementFunction
&
v_element_func
,
const
BiasDramBlockWindowTmp
&
bias_dram_block_window_tmp
,
// M0*N0 tile
const
BiasElementFunction
&
bias_element_func
,
LSEaccDramBlockWindowTmp
&
lse_acc_dram_window_tmp
,
// M0*1 tile
const
LSEaccElementFunction
&
lse_acc_element_func
,
const
SAccElementFunction
&
s_acc_element_func
,
const
PComputeElementFunction
&
p_compute_element_func
,
const
OAccElementFunction
&
o_acc_element_func
,
index_t
num_splits
,
index_t
i_split
,
FmhaMask
mask
,
PositionEncoding
position_encoding
,
float
scale_s
,
index_t
kv_l2p_offset
,
// logical-to-physical offset of seqlen_k coordinate
void
*
smem_ptr
)
const
{
static_assert
(
std
::
is_same_v
<
QDataType
,
remove_cvref_t
<
typename
QDramBlockWindowTmp
::
DataType
>>
&&
std
::
is_same_v
<
KDataType
,
remove_cvref_t
<
typename
KPageBlockNavigator
::
DataType
>>
&&
std
::
is_same_v
<
VDataType
,
remove_cvref_t
<
typename
VPageBlockNavigator
::
DataType
>>
,
"wrong!"
);
static_assert
(
kM0
==
QDramBlockWindowTmp
{}.
get_window_lengths
()[
number
<
0
>
{}]
&&
kSubQKHeaddim
==
QDramBlockWindowTmp
{}.
get_window_lengths
()[
number
<
1
>
{}]
&&
kN0
==
KDramBlockWindowLengths
{}[
number
<
0
>
{}]
&&
kK0
==
KDramBlockWindowLengths
{}[
number
<
1
>
{}]
&&
kN1
==
VDramBlockWindowLengths
{}[
number
<
0
>
{}]
&&
kK1
==
VDramBlockWindowLengths
{}[
number
<
1
>
{}]
&&
kM0
==
BiasDramBlockWindowTmp
{}.
get_window_lengths
()[
number
<
0
>
{}]
&&
kN0
==
BiasDramBlockWindowTmp
{}.
get_window_lengths
()[
number
<
1
>
{}],
"wrong!"
);
// Q tile in LDS
QDataType
*
q_lds_ptr
=
static_cast
<
QDataType
*>
(
static_cast
<
void
*>
(
static_cast
<
char
*>
(
smem_ptr
)));
auto
q_lds
=
make_tensor_view
<
address_space_enum
::
lds
>
(
q_lds_ptr
,
Policy
::
template
MakeQLdsBlockDescriptor
<
Problem
>());
// K tile in LDS
KDataType
*
k_lds_ptr
=
static_cast
<
KDataType
*>
(
static_cast
<
void
*>
(
static_cast
<
char
*>
(
smem_ptr
)));
auto
k_lds
=
make_tensor_view
<
address_space_enum
::
lds
>
(
k_lds_ptr
,
Policy
::
template
MakeKLdsBlockDescriptor
<
Problem
>());
auto
k_lds_window
=
make_tile_window
(
k_lds
,
make_tuple
(
number
<
kN0
>
{},
number
<
kK0
>
{}),
{
0
,
0
});
// V tile in LDS
auto
v_lds
=
make_tensor_view
<
address_space_enum
::
lds
>
(
reinterpret_cast
<
VDataType
*>
(
static_cast
<
char
*>
(
smem_ptr
)
+
max
(
Policy
::
template
GetSmemSizeQ
<
Problem
>(),
Policy
::
template
GetSmemSizeK
<
Problem
>())),
Policy
::
template
MakeVLdsBlockDescriptor
<
Problem
>());
auto
v_lds_window
=
make_tile_window
(
v_lds
,
Policy
::
template
MakeVLdsBlockDescriptor
<
Problem
>().
get_lengths
(),
{
0
,
0
});
// S tile in LDS
auto
s_lds
=
make_tensor_view
<
address_space_enum
::
lds
>
(
reinterpret_cast
<
SaccDataType
*>
(
reinterpret_cast
<
char
*>
(
smem_ptr
)
+
max
(
Policy
::
template
GetSmemSizeQ
<
Problem
>(),
Policy
::
template
GetSmemSizeK
<
Problem
>())),
Policy
::
template
MakeSLdsBlockDescriptor
<
Problem
>());
auto
s_write_lds_window
=
make_tile_window
(
s_lds
,
Policy
::
template
MakeSLdsBlockDescriptor
<
Problem
>().
get_lengths
(),
{
0
,
0
});
auto
s_read_lds_window
=
make_tile_window
(
s_lds
,
Policy
::
template
MakeSLdsBlockDescriptor
<
Problem
>().
get_lengths
(),
{
0
,
0
},
Policy
::
template
MakeSRegTileDistribution
<
Problem
>());
// Block GEMM
constexpr
auto
gemm_0
=
Policy
::
template
GetQKBlockGemm
<
Problem
>();
constexpr
auto
gemm_1
=
Policy
::
template
GetKVBlockGemm
<
Problem
>();
auto
q_dram_window
=
make_tile_window
(
q_dram_block_window_tmp
.
get_bottom_tensor_view
(),
q_dram_block_window_tmp
.
get_window_lengths
(),
q_dram_block_window_tmp
.
get_window_origin
(),
Policy
::
template
MakeQDramTileDistribution
<
Problem
>());
// load Q here, will store Q into LDS to maximize throughput
auto
origin_q
=
load_tile
(
q_dram_window
);
using
SaccBlockTileType
=
decltype
(
gemm_0
.
MakeCBlockTile
());
auto
s_acc
=
SaccBlockTileType
{};
// reduction function for softmax
const
auto
f_max
=
[](
auto
e0
,
auto
e1
)
{
return
max
(
e0
,
e1
);
};
const
auto
f_sum
=
[](
auto
e0
,
auto
e1
)
{
return
e0
+
e1
;
};
using
OaccBlockTileType
=
decltype
(
gemm_1
.
MakeCBlockTile
());
auto
o_acc
=
OaccBlockTileType
{};
// infer Sacc, S, P, M, L, Oacc type
using
SBlockTileType
=
decltype
(
cast_tile
<
SMPLComputeDataType
>
(
o_acc
));
using
MLBlockTileType
=
decltype
(
block_tile_reduce
<
SMPLComputeDataType
>
(
SBlockTileType
{},
sequence
<
1
>
{},
f_max
,
SMPLComputeDataType
{
0
}));
// init M, L
auto
m
=
MLBlockTileType
{};
auto
l
=
MLBlockTileType
{};
clear_tile
(
o_acc
);
set_tile
(
m
,
-
numeric
<
SMPLComputeDataType
>::
infinity
());
clear_tile
(
l
);
const
auto
q_origin
=
q_dram_window
.
get_window_origin
();
const
auto
[
logical_seqlen_k_start
,
logical_seqlen_k_end
]
=
mask
.
GetTileRangeAlongX
(
q_origin
.
at
(
number
<
0
>
{}),
number
<
kM0
>
{},
number
<
kN0
>
{},
num_splits
,
i_split
);
// check early exit if no work to do
if
constexpr
(
FmhaMask
::
IsMasking
||
kPadSeqLenK
||
kHasUnevenSplits
)
{
const
index_t
logical_num_total_loop
=
integer_divide_ceil
(
logical_seqlen_k_end
-
logical_seqlen_k_start
,
kN0
);
if
(
logical_num_total_loop
<=
0
)
{
if
constexpr
(
kStoreLSE
)
{
auto
lse_acc
=
make_static_distributed_tensor
<
LSEDataType
>
(
m
.
get_tile_distribution
());
set_tile
(
lse_acc
,
-
numeric
<
SMPLComputeDataType
>::
infinity
());
if
(
get_thread_local_1d_id
()
<
kM0
)
{
store_tile
(
lse_acc_dram_window_tmp
,
tile_elementwise_in
(
lse_acc_element_func
,
lse_acc
));
}
}
// Note: here occ are all cleard, return it
// Note: q loaded but no fence, ignore it.
return
o_acc
;
}
}
const
index_t
physical_seqlen_k_start
=
logical_seqlen_k_start
+
kv_l2p_offset
;
const
index_t
physical_seqlen_k_end
=
logical_seqlen_k_end
+
kv_l2p_offset
;
// make sure the first tile is completely located in page-block (page-block size should be
// divisible by kN0)
// relationship between each *_start variables: aligned_physical_seqlen_k_start <=
// physical_seqlen_k_start, logical_seqlen_k_start <= physical_seqlen_k_start
const
index_t
aligned_physical_seqlen_k_start
=
[
&
,
physical_seqlen_k_start_
=
physical_seqlen_k_start
]
{
if
constexpr
(
kIsPagedKV
)
{
return
kN0
*
integer_divide_floor
(
physical_seqlen_k_start_
,
kN0
);
}
else
{
return
physical_seqlen_k_start_
;
}
}();
const
index_t
num_total_loop
=
integer_divide_ceil
(
physical_seqlen_k_end
-
aligned_physical_seqlen_k_start
,
kN0
);
auto
[
i_page_block_k
,
k_dram_block_window
]
=
k_page_block_navigator
.
make_tile_window
(
k_dram_block_window_lengths
,
{
aligned_physical_seqlen_k_start
,
0
});
const
auto
bias_origin
=
bias_dram_block_window_tmp
.
get_window_origin
();
auto
bias_dram_window
=
make_tile_window
(
bias_dram_block_window_tmp
.
get_bottom_tensor_view
(),
bias_dram_block_window_tmp
.
get_window_lengths
(),
{
bias_origin
.
at
(
number
<
0
>
{}),
logical_seqlen_k_start
-
(
physical_seqlen_k_start
-
aligned_physical_seqlen_k_start
)},
// M/N
Policy
::
template
MakeBiasDramTileDistribution
<
decltype
(
gemm_0
)>());
auto
[
i_page_block_v
,
v_dram_window
]
=
v_page_block_navigator
.
make_tile_window
(
v_dram_block_window_lengths
,
{
0
,
aligned_physical_seqlen_k_start
},
// TODO: hdim split?
Policy
::
template
MakeVDramTileDistribution
<
Problem
>());
// store Q into LDS
__builtin_amdgcn_sched_barrier
(
0
);
auto
q_lds_window_for_store
=
make_tile_window
(
q_lds
,
Policy
::
template
MakeQLdsBlockDescriptor
<
Problem
>().
get_lengths
(),
{
0
,
0
});
store_tile
(
q_lds_window_for_store
,
origin_q
);
__builtin_amdgcn_sched_barrier
(
0
);
// load Q from LDS
__builtin_amdgcn_sched_barrier
(
0
);
auto
q_lds_window_for_load
=
make_tile_window
(
q_lds
,
Policy
::
template
MakeQLdsBlockDescriptor
<
Problem
>().
get_lengths
(),
{
0
,
0
},
Policy
::
template
MakeQRegTileDistribution
<
Problem
,
decltype
(
gemm_0
)>());
block_sync_lds
();
auto
q
=
load_tile
(
q_lds_window_for_load
);
__builtin_amdgcn_sched_barrier
(
0
);
auto
q_tile
=
tile_elementwise_in
(
q_element_func
,
q
);
// prefetch K tile
index_t
i_total_loops
=
0
;
constexpr
index_t
k0_loops
=
kQKHeaddim
/
kK0
;
constexpr
index_t
k1_loops
=
kN0
/
kK1
;
static_assert
(
2
<=
k0_loops
);
static_assert
(
1
<=
k1_loops
);
auto
k_dram_window
=
make_tile_window
(
k_dram_block_window
,
Policy
::
template
MakeKDramTileDistribution
<
Problem
>());
// K DRAM tile window for
// load the first tile of the first iteration and store to LDS
auto
k_block_tile
=
load_tile
(
k_dram_window
);
// moving k_dram_window is an in-page-block operation, so there is
// no need to invoke k_page_block_navigator.move_tile_window() here.
move_tile_window
(
k_dram_window
,
{
0
,
kK0
});
store_tile
(
k_lds_window
,
tile_elementwise_in
(
k_element_func
,
k_block_tile
));
do
{
// STAGE 1, QK gemm
clear_tile
(
s_acc
);
// initialize C
// load the second tile of the first iteration
k_block_tile
=
load_tile
(
k_dram_window
);
if
constexpr
(
BiasEnum
==
BlockAttentionBiasEnum
::
ELEMENTWISE_BIAS
)
{
__builtin_amdgcn_sched_barrier
(
0
);
// prevent from messing up the order of global loads
}
const
auto
bias_tile
=
load_tile
(
bias_dram_window
);
// load bias tile
if
constexpr
(
BiasEnum
==
BlockAttentionBiasEnum
::
ELEMENTWISE_BIAS
)
{
__builtin_amdgcn_sched_barrier
(
0
);
// prevent from messing up the order of global loads
}
if
constexpr
(
k0_loops
>
2
)
{
static_for
<
0
,
k0_loops
-
2
,
1
>
{}([
&
](
auto
i_k0
)
{
block_sync_lds
();
gemm_0
(
s_acc
,
get_slice_tile
(
q_tile
,
sequence
<
0
,
i_k0
*
kK0
>
{},
sequence
<
kM0
,
(
i_k0
+
1
)
*
kK0
>
{}),
k_lds_window
);
block_sync_lds
();
move_tile_window
(
k_dram_window
,
{
0
,
kK0
});
store_tile
(
k_lds_window
,
tile_elementwise_in
(
k_element_func
,
k_block_tile
));
// LDS write i + 1
k_block_tile
=
load_tile
(
k_dram_window
);
// global read i + 2
});
}
const
auto
v_prefetch
=
load_tile
(
v_dram_window
);
// prefetch load v tile
{
// tail
block_sync_lds
();
gemm_0
(
s_acc
,
get_slice_tile
(
q_tile
,
sequence
<
0
,
(
k0_loops
-
2
)
*
kK0
>
{},
sequence
<
kM0
,
(
k0_loops
-
1
)
*
kK0
>
{}),
k_lds_window
);
block_sync_lds
();
store_tile
(
k_lds_window
,
tile_elementwise_in
(
k_element_func
,
k_block_tile
));
block_sync_lds
();
gemm_0
(
s_acc
,
get_slice_tile
(
q_tile
,
sequence
<
0
,
(
k0_loops
-
1
)
*
kK0
>
{},
sequence
<
kM0
,
k0_loops
*
kK0
>
{}),
k_lds_window
);
}
// STAGE 2, scale_s, add bias, mask, softmax
if
constexpr
(
BiasEnum
==
BlockAttentionBiasEnum
::
ELEMENTWISE_BIAS
)
{
s_acc
=
tile_elementwise_in
(
s_acc_element_func
,
s_acc
);
tile_elementwise_inout
([
&
scale_s
](
auto
&
x
)
{
x
=
x
*
scale_s
;
},
s_acc
);
tile_elementwise_inout
(
[
&
](
auto
&
x
,
const
auto
&
y
)
{
#if !CK_TILE_FMHA_FWD_FAST_EXP2
x
+=
type_convert
<
SaccDataType
>
(
bias_element_func
(
y
));
#else
x
+=
log2e_v
<
SaccDataType
>
*
type_convert
<
SaccDataType
>
(
bias_element_func
(
y
));
#endif
},
s_acc
,
bias_tile
);
}
else
if
constexpr
(
BiasEnum
==
BlockAttentionBiasEnum
::
ALIBI
)
{
const
auto
k_origin
=
k_page_block_navigator
.
to_global_window_origin
(
i_page_block_k
,
k_dram_block_window
.
get_window_origin
());
constexpr
auto
s_spans
=
decltype
(
s_acc
)
::
get_distributed_spans
();
s_acc
=
tile_elementwise_in
(
s_acc_element_func
,
s_acc
);
sweep_tile_span
(
s_spans
[
number
<
0
>
{}],
[
&
](
auto
idx0
)
{
sweep_tile_span
(
s_spans
[
number
<
1
>
{}],
[
&
](
auto
idx1
)
{
const
auto
tile_idx
=
get_x_indices_from_distributed_indices
(
s_acc
.
get_tile_distribution
(),
make_tuple
(
idx0
,
idx1
));
const
auto
row
=
q_origin
.
at
(
number
<
0
>
{})
+
tile_idx
.
at
(
number
<
0
>
{});
const
auto
col
=
k_origin
.
at
(
number
<
0
>
{})
+
tile_idx
.
at
(
number
<
1
>
{});
constexpr
auto
i_j_idx
=
make_tuple
(
idx0
,
idx1
);
s_acc
(
i_j_idx
)
*=
scale_s
;
// position_encoding accept only logical coordinates, do conversion here
position_encoding
.
update
(
s_acc
(
i_j_idx
),
row
,
col
-
kv_l2p_offset
);
});
});
}
else
{
s_acc
=
tile_elementwise_in
(
s_acc_element_func
,
s_acc
);
#if !CK_TILE_FMHA_FWD_FAST_EXP2
tile_elementwise_inout
([
&
scale_s
](
auto
&
x
)
{
x
=
x
*
scale_s
;
},
s_acc
);
#endif
}
move_tile_window
(
bias_dram_window
,
{
0
,
kN0
});
/// TODO: only check in first/last iteration without increasing code size
if
constexpr
(
kHasUnevenSplits
)
{
const
auto
k_origin
=
k_page_block_navigator
.
to_global_window_origin
(
i_page_block_k
,
k_dram_block_window
.
get_window_origin
());
set_tile_if
(
s_acc
,
-
numeric
<
SMPLComputeDataType
>::
infinity
(),
[
&
,
physical_seqlen_k_start_
=
physical_seqlen_k_start
,
physical_seqlen_k_end_
=
physical_seqlen_k_end
](
auto
tile_idx
)
{
const
auto
col
=
k_origin
.
at
(
number
<
0
>
{})
+
tile_idx
.
at
(
number
<
1
>
{});
if
constexpr
(
kIsPagedKV
)
{
return
col
<
physical_seqlen_k_start_
||
physical_seqlen_k_end_
<=
col
;
}
else
{
return
physical_seqlen_k_end_
<=
col
;
}
});
}
if
constexpr
(
kPadSeqLenK
||
FmhaMask
::
IsMasking
)
{
const
auto
k_origin
=
k_page_block_navigator
.
to_global_window_origin
(
i_page_block_k
,
k_dram_block_window
.
get_window_origin
());
// mask accept only logical coordinates, do conversion here
bool
need_perpixel_check
=
mask
.
IsEdgeTile
(
q_origin
.
at
(
number
<
0
>
{}),
k_origin
.
at
(
number
<
0
>
{})
-
kv_l2p_offset
,
number
<
kM0
>
{},
number
<
kN0
>
{});
if
(
need_perpixel_check
)
{
set_tile_if
(
s_acc
,
-
numeric
<
SMPLComputeDataType
>::
infinity
(),
[
&
](
auto
tile_idx
)
{
const
auto
row
=
q_origin
.
at
(
number
<
0
>
{})
+
tile_idx
.
at
(
number
<
0
>
{});
const
auto
col
=
k_origin
.
at
(
number
<
0
>
{})
+
tile_idx
.
at
(
number
<
1
>
{});
return
mask
.
IsOutOfBound
(
row
,
col
-
kv_l2p_offset
);
});
}
}
__builtin_amdgcn_sched_barrier
(
0
);
// load the first tile for next iteration
if
(
i_total_loops
<
num_total_loop
-
1
)
{
// move K tile windows
i_page_block_k
=
k_page_block_navigator
.
move_tile_window
(
i_page_block_k
,
k_dram_block_window
,
{
kN0
,
0
});
k_dram_window
=
make_tile_window
(
k_dram_block_window
,
Policy
::
template
MakeKDramTileDistribution
<
Problem
>());
// K DRAM tile window
// laod the first tile of the first iteration and store to LDS
k_block_tile
=
load_tile
(
k_dram_window
);
}
__builtin_amdgcn_sched_barrier
(
0
);
const
auto
s
=
cast_tile
<
SMPLComputeDataType
>
(
s_acc
);
// S{j}
// shuffle through LDS so that the tile layout is consistent with required by Gemm1
store_tile
(
s_write_lds_window
,
s
);
block_sync_lds
();
auto
s_new
=
load_tile
(
s_read_lds_window
);
auto
m_local
=
block_tile_reduce
<
SMPLComputeDataType
>
(
s_new
,
sequence
<
1
>
{},
f_max
,
-
numeric
<
SMPLComputeDataType
>::
infinity
());
// m_local = rowmax(S{j})
block_tile_reduce_sync
(
m_local
,
f_max
,
bool_constant
<
false
>
{});
const
auto
m_old
=
m
;
// m{j-1}
tile_elementwise_inout
(
[](
auto
&
e0
,
auto
e1
,
auto
e2
)
{
e0
=
max
(
e1
,
e2
);
},
m
,
m_old
,
m_local
);
// m{j}
auto
p_compute
=
make_static_distributed_tensor
<
SMPLComputeDataType
>
(
s_new
.
get_tile_distribution
());
// Pcompute{j}
static
const
auto
get_validated_m
=
[](
SMPLComputeDataType
raw_m
)
{
/// NOTICE: bias might be materialized mask including -inf values, need
/// consideration
if
constexpr
(
BiasEnum
==
BlockAttentionBiasEnum
::
ELEMENTWISE_BIAS
||
FmhaMask
::
IsMasking
)
{
return
raw_m
==
-
numeric
<
SMPLComputeDataType
>::
infinity
()
?
type_convert
<
SMPLComputeDataType
>
(
0.
f
)
:
raw_m
;
}
else
{
return
raw_m
;
}
};
constexpr
auto
p_spans
=
decltype
(
p_compute
)
::
get_distributed_spans
();
sweep_tile_span
(
p_spans
[
number
<
0
>
{}],
[
&
](
auto
idx0
)
{
constexpr
auto
i_idx
=
make_tuple
(
idx0
);
#if CK_TILE_FMHA_FWD_FAST_EXP2
auto
row_max
=
scale_s
*
get_validated_m
(
m
[
i_idx
]);
#endif
sweep_tile_span
(
p_spans
[
number
<
1
>
{}],
[
&
](
auto
idx1
)
{
constexpr
auto
i_j_idx
=
make_tuple
(
idx0
,
idx1
);
#if CK_TILE_FMHA_FWD_FAST_EXP2
if
constexpr
(
BiasEnum
==
BlockAttentionBiasEnum
::
ELEMENTWISE_BIAS
||
BiasEnum
==
BlockAttentionBiasEnum
::
ALIBI
)
{
p_compute
(
i_j_idx
)
=
exp2
(
s_new
[
i_j_idx
]
-
get_validated_m
(
m
[
i_idx
]));
}
else
{
p_compute
(
i_j_idx
)
=
exp2
(
scale_s
*
s_new
[
i_j_idx
]
-
row_max
);
}
#else
p_compute
(
i_j_idx
)
=
exp
(
s_new
[
i_j_idx
]
-
get_validated_m
(
m
[
i_idx
]));
#endif
});
});
auto
rowsum_p
=
block_tile_reduce
<
SMPLComputeDataType
>
(
p_compute
,
sequence
<
1
>
{},
f_sum
,
SMPLComputeDataType
{
0
});
// rowsum(Pcompute{j})
block_tile_reduce_sync
(
rowsum_p
,
f_sum
,
bool_constant
<
false
>
{});
const
auto
p
=
cast_tile
<
PDataType
>
(
tile_elementwise_in
(
p_compute_element_func
,
p_compute
));
// l{j}, Oacc{j}
constexpr
auto
o_spans
=
decltype
(
o_acc
)
::
get_distributed_spans
();
sweep_tile_span
(
o_spans
[
number
<
0
>
{}],
[
&
](
auto
idx0
)
{
constexpr
auto
i_idx
=
make_tuple
(
idx0
);
#if CK_TILE_FMHA_FWD_FAST_EXP2
const
auto
tmp
=
[
&
]()
{
if
constexpr
(
BiasEnum
==
BlockAttentionBiasEnum
::
ELEMENTWISE_BIAS
||
BiasEnum
==
BlockAttentionBiasEnum
::
ALIBI
)
{
return
exp2
(
m_old
[
i_idx
]
-
get_validated_m
(
m
[
i_idx
]));
}
else
{
auto
row_max
=
scale_s
*
get_validated_m
(
m
[
i_idx
]);
return
exp2
(
scale_s
*
m_old
[
i_idx
]
-
row_max
);
}
}();
#else
const
auto
tmp
=
exp
(
m_old
[
i_idx
]
-
get_validated_m
(
m
[
i_idx
]));
#endif
l
(
i_idx
)
=
tmp
*
l
[
i_idx
]
+
rowsum_p
[
i_idx
];
sweep_tile_span
(
o_spans
[
number
<
1
>
{}],
[
&
](
auto
idx1
)
{
constexpr
auto
i_j_idx
=
make_tuple
(
idx0
,
idx1
);
// FIXME: this use different equation from FA v2 paper,
// but produce correc result.
// Is the equation wrong?
o_acc
(
i_j_idx
)
*=
tmp
;
});
});
block_sync_lds
();
if
constexpr
(
std
::
is_same_v
<
VLayout
,
ck_tile
::
tensor_layout
::
gemm
::
RowMajor
>
)
{
auto
v_shuffle_tmp
=
make_static_distributed_tensor
<
VDataType
>
(
Policy
::
template
MakeShuffledVRegBlockDescriptor
<
Problem
>());
shuffle_tile
(
v_shuffle_tmp
,
v_prefetch
);
store_tile
(
v_lds_window
,
tile_elementwise_in
(
v_element_func
,
v_shuffle_tmp
));
// store the prefetch
}
else
{
store_tile
(
v_lds_window
,
tile_elementwise_in
(
v_element_func
,
v_prefetch
));
// store the prefetch
}
i_page_block_v
=
v_page_block_navigator
.
move_tile_window
(
i_page_block_v
,
v_dram_window
,
{
0
,
kK1
});
// STAGE 3, KV gemm
if
constexpr
(
k1_loops
>
1
)
{
static_for
<
0
,
k1_loops
-
1
,
1
>
{}([
&
,
&
i_page_block_v_
=
i_page_block_v
,
&
v_dram_window_
=
v_dram_window
](
auto
i_k1
)
{
const
auto
v
=
load_tile
(
v_dram_window_
);
// load next v
block_sync_lds
();
gemm_1
(
o_acc
,
get_slice_tile
(
p
,
sequence
<
0
,
i_k1
*
kK1
>
{},
sequence
<
kM0
,
(
i_k1
+
1
)
*
kK1
>
{}),
v_lds_window
);
block_sync_lds
();
if
constexpr
(
std
::
is_same_v
<
VLayout
,
ck_tile
::
tensor_layout
::
gemm
::
RowMajor
>
)
{
auto
v_shuffle_tmp
=
make_static_distributed_tensor
<
VDataType
>
(
Policy
::
template
MakeShuffledVRegBlockDescriptor
<
Problem
>());
shuffle_tile
(
v_shuffle_tmp
,
v
);
store_tile
(
v_lds_window
,
tile_elementwise_in
(
v_element_func
,
v_shuffle_tmp
));
// store the prefetch
}
else
{
store_tile
(
v_lds_window
,
tile_elementwise_in
(
v_element_func
,
v
));
// store next v
}
i_page_block_v_
=
v_page_block_navigator
.
move_tile_window
(
i_page_block_v_
,
v_dram_window_
,
{
0
,
kK1
});
});
}
// tail
{
block_sync_lds
();
gemm_1
(
o_acc
,
get_slice_tile
(
p
,
sequence
<
0
,
(
k1_loops
-
1
)
*
kK1
>
{},
sequence
<
kM0
,
k1_loops
*
kK1
>
{}),
v_lds_window
);
block_sync_lds
();
}
__builtin_amdgcn_sched_barrier
(
0
);
// load the first tile for next iteration
if
(
i_total_loops
<
num_total_loop
-
1
)
{
// store the first tile for next iteration to LDS
// moving k_dram_window is an in-page-block operation, so there is
// no need to invoke k_page_block_navigator.move_tile_window() here.
move_tile_window
(
k_dram_window
,
{
0
,
kK0
});
store_tile
(
k_lds_window
,
tile_elementwise_in
(
k_element_func
,
k_block_tile
));
}
}
while
(
++
i_total_loops
<
num_total_loop
);
if
constexpr
(
kStoreLSE
)
{
// store lse acc
auto
lse_acc
=
make_static_distributed_tensor
<
LSEDataType
>
(
m
.
get_tile_distribution
());
constexpr
auto
lse_acc_spans
=
decltype
(
lse_acc
)
::
get_distributed_spans
();
sweep_tile_span
(
lse_acc_spans
[
number
<
0
>
{}],
[
&
,
m_
=
m
,
l_
=
l
](
auto
idx0
)
{
constexpr
auto
i_idx
=
make_tuple
(
idx0
);
#if CK_TILE_FMHA_FWD_FAST_EXP2
if
constexpr
(
BiasEnum
==
BlockAttentionBiasEnum
::
ELEMENTWISE_BIAS
||
BiasEnum
==
BlockAttentionBiasEnum
::
ALIBI
)
{
lse_acc
(
i_idx
)
=
m_
[
i_idx
]
/
C_LOG2E
+
log
(
l_
[
i_idx
]);
}
else
{
lse_acc
(
i_idx
)
=
m_
[
i_idx
]
*
scale_s
/
C_LOG2E
+
log
(
l_
[
i_idx
]);
}
#else
lse_acc
(
i_idx
)
=
m_
[
i_idx
]
+
log
(
l_
[
i_idx
]);
#endif
});
if
(
get_thread_local_1d_id
()
<
kM0
)
{
store_tile
(
lse_acc_dram_window_tmp
,
tile_elementwise_in
(
lse_acc_element_func
,
lse_acc
));
}
}
// finally, O
constexpr
auto
o_spans
=
decltype
(
o_acc
)
::
get_distributed_spans
();
sweep_tile_span
(
o_spans
[
number
<
0
>
{}],
[
&
](
auto
idx0
)
{
constexpr
auto
i_idx
=
make_tuple
(
idx0
);
const
auto
tmp
=
[
&
]()
{
if
constexpr
(
BiasEnum
==
BlockAttentionBiasEnum
::
ELEMENTWISE_BIAS
||
FmhaMask
::
IsMasking
)
{
return
l
[
i_idx
]
==
0.
f
?
0.
f
:
1
/
l
[
i_idx
];
}
else
return
1
/
l
[
i_idx
];
}();
sweep_tile_span
(
o_spans
[
number
<
1
>
{}],
[
&
](
auto
idx1
)
{
constexpr
auto
i_j_idx
=
make_tuple
(
idx0
,
idx1
);
o_acc
(
i_j_idx
)
*=
tmp
;
});
});
o_acc
=
tile_elementwise_in
(
o_acc_element_func
,
o_acc
);
return
o_acc
;
}
template
<
typename
QDramBlockWindowTmp
,
typename
KDramBlockWindowLengths
,
typename
KPageBlockNavigator
,
typename
VDramBlockWindowLengths
,
typename
VPageBlockNavigator
,
typename
BiasDramBlockWindowTmp
,
typename
LSEaccDramBlockWindowTmp
,
typename
PositionEncoding
>
CK_TILE_HOST_DEVICE
auto
operator
()(
const
QDramBlockWindowTmp
&
q_dram_block_window_tmp
,
// M0*K0 tile
const
KDramBlockWindowLengths
&
k_dram_block_window_lengths
,
// N0*K0 tile
const
KPageBlockNavigator
&
k_page_block_navigator
,
const
VDramBlockWindowLengths
&
v_dram_block_window_lengths
,
// N1*K1 tile
const
VPageBlockNavigator
&
v_page_block_navigator
,
const
BiasDramBlockWindowTmp
&
bias_dram_block_window_tmp
,
// M0*N0 tile
LSEaccDramBlockWindowTmp
&
lse_acc_dram_block_window_tmp
,
// M0*1 tile
index_t
num_splits
,
index_t
i_split
,
FmhaMask
mask
,
PositionEncoding
position_encoding
,
float
scale_s
,
index_t
kv_l2p_offset
,
// logical-to-physical offset of seqlen_k coordinate
void
*
smem_ptr
)
const
{
return
operator
()(
q_dram_block_window_tmp
,
identity
{},
k_dram_block_window_lengths
,
k_page_block_navigator
,
identity
{},
v_dram_block_window_lengths
,
v_page_block_navigator
,
identity
{},
bias_dram_block_window_tmp
,
identity
{},
lse_acc_dram_block_window_tmp
,
identity
{},
identity
{},
identity
{},
identity
{},
num_splits
,
i_split
,
mask
,
position_encoding
,
scale_s
,
kv_l2p_offset
,
smem_ptr
);
}
};
}
// namespace ck_tile
include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_nwarp_sshuffle_qr_ks_vs_default_policy.hpp
0 → 100644
View file @
2a768425
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp"
#include "ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_custom_policy.hpp"
#include "ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1.hpp"
namespace
ck_tile
{
// This pipeline is qkv all located in LDS
struct
BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVSDefaultPolicy
:
BlockFmhaPipelineQXKSVSCustomPolicy
<
/* QLoadOnce = */
true
,
/* AsyncCopyK = */
false
,
/* AsyncCopyV = */
false
,
/* NumPrefetchK = */
1
,
/* NumPrefetchV = */
1
>
{
using
BasePolicy
=
BlockFmhaPipelineQXKSVSCustomPolicy
<
/* QLoadOnce = */
true
,
/* AsyncCopyK = */
false
,
/* AsyncCopyV = */
false
,
/* NumPrefetchK = */
1
,
/* NumPrefetchV = */
1
>
;
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
GetAlignmentQ
()
{
constexpr
index_t
kBlockSize
=
Problem
::
kBlockSize
;
constexpr
index_t
kMPerBlock
=
Problem
::
BlockFmhaShape
::
kM0
;
constexpr
index_t
kKPerBlock
=
Problem
::
BlockFmhaShape
::
kSubQKHeaddim
;
constexpr
index_t
MaxVectorSize
=
16
/
sizeof
(
typename
Problem
::
QDataType
);
// this should align with MakeQDramTileDistribution()
constexpr
index_t
ElemPerThread
=
(
kMPerBlock
*
kKPerBlock
)
/
kBlockSize
;
static_assert
(
0
<
ElemPerThread
);
return
min
(
ElemPerThread
,
MaxVectorSize
);
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
GetAlignmentOacc
()
{
using
OaccDataType
=
remove_cvref_t
<
typename
Problem
::
OaccDataType
>
;
return
static_cast
<
index_t
>
(
16
/
sizeof
(
OaccDataType
));
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
MakeQDramTileDistribution
()
{
constexpr
index_t
kBlockSize
=
Problem
::
kBlockSize
;
constexpr
index_t
kMPerBlock
=
Problem
::
BlockFmhaShape
::
kM0
;
constexpr
index_t
kKPerBlock
=
Problem
::
BlockFmhaShape
::
kSubQKHeaddim
;
constexpr
index_t
MaxVectorSize
=
16
/
sizeof
(
typename
Problem
::
QDataType
);
constexpr
index_t
ElemPerThread
=
(
kMPerBlock
*
kKPerBlock
)
/
kBlockSize
;
static_assert
(
0
<
ElemPerThread
);
constexpr
index_t
kMaxVecLoad
=
min
(
ElemPerThread
,
MaxVectorSize
);
constexpr
index_t
KPerThread
=
kMaxVecLoad
;
constexpr
index_t
KThreads
=
kKPerBlock
/
KPerThread
;
constexpr
index_t
MThreadPerWarp
=
get_warp_size
()
/
KThreads
;
constexpr
index_t
NumWarps
=
kBlockSize
/
get_warp_size
();
constexpr
index_t
MPerThread
=
kMPerBlock
/
(
MThreadPerWarp
*
NumWarps
);
return
make_static_tile_distribution
(
tile_distribution_encoding
<
sequence
<
1
>
,
tuple
<
sequence
<
MPerThread
,
NumWarps
,
MThreadPerWarp
>
,
sequence
<
KThreads
,
KPerThread
>>
,
tuple
<
sequence
<
1
>
,
sequence
<
1
,
2
>>
,
tuple
<
sequence
<
1
>
,
sequence
<
2
,
0
>>
,
sequence
<
1
,
2
>
,
sequence
<
0
,
1
>>
{});
}
template
<
typename
Problem
,
typename
BlockGemm
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
MakeQRegTileDistribution
()
{
return
BasePolicy
::
template
MakeQDramTileDistribution
<
Problem
,
BlockGemm
>();
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
GetSmemKPackQ
()
{
// TODO: this is for 3d layout
using
QDataType
=
remove_cvref_t
<
typename
Problem
::
QDataType
>
;
return
static_cast
<
index_t
>
(
16
/
sizeof
(
QDataType
));
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
MakeQLdsBlockDescriptor
()
{
constexpr
index_t
kBlockSize
=
Problem
::
kBlockSize
;
constexpr
index_t
kMPerBlock
=
Problem
::
BlockFmhaShape
::
kM0
;
constexpr
index_t
kKPerBlock
=
Problem
::
BlockFmhaShape
::
kSubQKHeaddim
;
constexpr
index_t
ElemPerThread
=
(
kMPerBlock
*
kKPerBlock
)
/
kBlockSize
;
static_assert
(
0
<
ElemPerThread
);
constexpr
index_t
kKPack
=
min
(
ElemPerThread
,
GetSmemKPackQ
<
Problem
>
());
constexpr
auto
q_lds_block_desc_0
=
make_naive_tensor_descriptor
(
make_tuple
(
number
<
kKPerBlock
/
kKPack
>
{},
number
<
kMPerBlock
>
{},
number
<
kKPack
>
{}),
make_tuple
(
number
<
(
kMPerBlock
+
1
)
*
kKPack
>
{},
number
<
kKPack
>
{},
number
<
1
>
{}),
number
<
kKPack
>
{},
number
<
1
>
{});
constexpr
auto
q_lds_block_desc
=
transform_tensor_descriptor
(
q_lds_block_desc_0
,
make_tuple
(
make_pass_through_transform
(
number
<
kMPerBlock
>
{}),
make_merge_transform
(
make_tuple
(
number
<
kKPerBlock
/
kKPack
>
{},
number
<
kKPack
>
{}))),
make_tuple
(
sequence
<
1
>
{},
sequence
<
0
,
2
>
{}),
make_tuple
(
sequence
<
0
>
{},
sequence
<
1
>
{}));
return
q_lds_block_desc
;
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
GetSmemNPackS
()
{
using
SDataType
=
remove_cvref_t
<
typename
Problem
::
SaccDataType
>
;
return
static_cast
<
index_t
>
(
16
/
sizeof
(
SDataType
));
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
MakeSLdsBlockDescriptor
()
{
constexpr
index_t
kMPerBlock
=
Problem
::
BlockFmhaShape
::
kM0
;
constexpr
index_t
kNPerBlock
=
Problem
::
BlockFmhaShape
::
kN0
;
constexpr
index_t
kNPack
=
GetSmemNPackS
<
Problem
>
();
constexpr
auto
s_lds_block_desc_0
=
make_naive_tensor_descriptor
(
make_tuple
(
number
<
kNPerBlock
/
kNPack
>
{},
number
<
kMPerBlock
>
{},
number
<
kNPack
>
{}),
make_tuple
(
number
<
(
kMPerBlock
+
1
)
*
kNPack
>
{},
number
<
kNPack
>
{},
number
<
1
>
{}),
number
<
kNPack
>
{},
number
<
1
>
{});
constexpr
auto
s_lds_block_desc
=
transform_tensor_descriptor
(
s_lds_block_desc_0
,
make_tuple
(
make_pass_through_transform
(
number
<
kMPerBlock
>
{}),
make_merge_transform
(
make_tuple
(
number
<
kNPerBlock
/
kNPack
>
{},
number
<
kNPack
>
{}))),
make_tuple
(
sequence
<
1
>
{},
sequence
<
0
,
2
>
{}),
make_tuple
(
sequence
<
0
>
{},
sequence
<
1
>
{}));
return
s_lds_block_desc
;
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
MakeSRegTileDistribution
()
{
using
BlockGemm
=
remove_cvref_t
<
decltype
(
GetKVBlockGemm
<
Problem
>
())
>
;
constexpr
auto
config
=
BlockGemm
::
Policy
::
template
GetWarpGemmMWarpNWarp
<
Problem
>();
using
WG
=
remove_cvref_t
<
decltype
(
config
.
template
at
<
0
>())
>
;
constexpr
index_t
MWarp
=
config
.
template
at
<
1
>();
constexpr
index_t
NWarp
=
config
.
template
at
<
2
>();
static_assert
(
MWarp
==
1
,
"Check failed!"
);
constexpr
index_t
kMPerBlock
=
Problem
::
BlockFmhaShape
::
kM0
;
constexpr
index_t
kKPerBlock
=
Problem
::
BlockFmhaShape
::
kK1
;
constexpr
index_t
kTileK
=
Problem
::
BlockFmhaShape
::
kN0
;
// K2 is equal to Impl::kABKPerLane * kKIterPerWarpGemm
constexpr
index_t
K3
=
WG
::
kK
/
WG
::
WarpGemmAttribute
::
Impl
::
kABKLane
;
constexpr
index_t
K2
=
WG
::
WarpGemmAttribute
::
Impl
::
kABKLane
;
constexpr
index_t
K1
=
kKPerBlock
/
(
K2
*
K3
);
constexpr
index_t
K0
=
kTileK
/
kKPerBlock
;
constexpr
index_t
M2
=
WG
::
WarpGemmAttribute
::
Impl
::
kAMLane
;
constexpr
index_t
M1
=
MWarp
;
constexpr
index_t
M0
=
kMPerBlock
/
(
M2
*
M1
);
constexpr
auto
s2_block_dstr_encoding
=
tile_distribution_encoding
<
sequence
<
NWarp
>
,
tuple
<
sequence
<
M0
,
M1
,
M2
>
,
sequence
<
K0
,
K1
,
K2
,
K3
>>
,
tuple
<
sequence
<
1
,
0
>
,
sequence
<
2
,
1
>>
,
tuple
<
sequence
<
1
,
0
>
,
sequence
<
2
,
2
>>
,
sequence
<
1
,
2
,
2
,
2
>
,
sequence
<
0
,
0
,
1
,
3
>>
{};
constexpr
auto
s2_block_dstr
=
make_static_tile_distribution
(
s2_block_dstr_encoding
);
return
s2_block_dstr
;
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
ck_tile
::
index_t
GetSmemSizeQ
()
{
return
MakeQLdsBlockDescriptor
<
Problem
>
().
get_element_space_size
()
*
sizeof
(
typename
Problem
::
QDataType
);
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
ck_tile
::
index_t
GetSmemSizeK
()
{
return
MakeKLdsBlockDescriptor
<
Problem
>
().
get_element_space_size
()
*
sizeof
(
typename
Problem
::
KDataType
);
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
ck_tile
::
index_t
GetSmemSizeV
()
{
return
MakeVLdsBlockDescriptor
<
Problem
>
().
get_element_space_size
()
*
sizeof
(
typename
Problem
::
VDataType
);
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
ck_tile
::
index_t
GetSmemSizeS
()
{
return
MakeSLdsBlockDescriptor
<
Problem
>
().
get_element_space_size
()
*
sizeof
(
typename
Problem
::
SaccDataType
);
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
ck_tile
::
index_t
GetSmemSize
()
{
return
max
(
GetSmemSizeQ
<
Problem
>
(),
GetSmemSizeK
<
Problem
>
())
+
max
(
GetSmemSizeV
<
Problem
>
(),
GetSmemSizeS
<
Problem
>
());
}
};
}
// namespace ck_tile
include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_problem.hpp
View file @
2a768425
...
...
@@ -106,28 +106,43 @@ struct BlockFmhaFwdSplitKVPipelineProblem
static
constexpr
index_t
kBlockPerCu
=
Traits
::
kBlockPerCu
;
};
// extract tile size attributes to remove dependency on traits
template
<
typename
OaccDataType_
,
ck_tile
::
index_t
kN1_
>
struct
BlockFmhaSplitKVCombinePipelineTileSizes
{
static
constexpr
index_t
MaxVectorSize
=
16
/
sizeof
(
OaccDataType_
);
static
constexpr
index_t
kN1
=
kN1_
;
static
constexpr
index_t
NThreads
=
kN1
/
MaxVectorSize
;
static
constexpr
index_t
kM0
=
get_warp_size
()
/
NThreads
;
// MThreadPerWarp
};
template
<
typename
LSEDataType_
,
typename
OaccDataType_
,
typename
ODataType_
,
index_t
HeadDimV_
,
index_t
kM0_
,
index_t
kN1_
,
bool
kIsGroupMode_
,
ck_tile
::
index_t
kN1_
,
typename
Traits_
>
struct
BlockFmhaSplitKVCombinePipelineProblem
:
BlockFmhaSplitKVCombinePipelineTileSizes
<
OaccDataType_
,
kN1_
>
{
using
BaseType
=
BlockFmhaSplitKVCombinePipelineTileSizes
<
OaccDataType_
,
kN1_
>
;
using
LSEDataType
=
remove_cvref_t
<
LSEDataType_
>
;
using
OaccDataType
=
remove_cvref_t
<
OaccDataType_
>
;
using
ODataType
=
remove_cvref_t
<
ODataType_
>
;
using
Traits
=
remove_cvref_t
<
Traits_
>
;
static
constexpr
index_t
kNumWarps
=
kM0_
/
(
get_warp_size
()
/
4
);
static
constexpr
index_t
kBlockSize
=
kNumWarps
*
get_warp_size
();
static
constexpr
bool
kIsGroupMode
=
kIsGroupMode_
;
static_assert
(
std
::
is_same_v
<
LSEDataType
,
OaccDataType
>
);
static
constexpr
index_t
kHeadDimV
=
HeadDimV_
;
static
constexpr
index_t
kM0
=
kM0_
;
static
constexpr
index_t
kN1
=
kN1_
;
static
constexpr
bool
kIsGroupMode
=
kIsGroupMode_
;
using
BaseType
::
kM0
;
using
BaseType
::
kN1
;
static_assert
(
kN1
<=
kHeadDimV
&&
kHeadDimV
%
kN1
==
0
);
// attributes from traits
static
constexpr
bool
kPadSeqLenQ
=
Traits
::
kPadSeqLenQ
;
...
...
@@ -136,6 +151,13 @@ struct BlockFmhaSplitKVCombinePipelineProblem
static
constexpr
bool
kDoFp8StaticQuant
=
Traits
::
kDoFp8StaticQuant
;
static
constexpr
index_t
kBlockPerCu
=
Traits
::
kBlockPerCu
;
static
constexpr
index_t
kMaxSplits
=
Traits
::
kMaxSplits
;
static_assert
(
8
<=
kMaxSplits
);
static
constexpr
index_t
kNumWarps
=
4
;
// always use 4 warps for each workgroup
static
constexpr
index_t
kBlockSize
=
kNumWarps
*
get_warp_size
();
static_assert
(
get_warp_size
()
<=
(
kM0
*
kMaxSplits
)
&&
(
kM0
*
kMaxSplits
)
%
get_warp_size
()
==
0
);
};
template
<
typename
QDataType_
,
...
...
include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp
View file @
2a768425
...
...
@@ -41,52 +41,21 @@ struct BlockFmhaPipelineQXCustomPolicy</* QLoadOnce = */ true>
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
GetAlignmentQ
()
{
constexpr
index_t
MaxVectorSize
=
16
/
sizeof
(
typename
Problem
::
QDataType
);
using
BlockGemm
=
remove_cvref_t
<
decltype
(
GetQKBlockGemm
<
Problem
>
())
>
;
constexpr
auto
config
=
BlockGemm
::
Policy
::
template
GetWarpGemmMWarpNWarp
<
Problem
>();
using
WG
=
remove_cvref_t
<
decltype
(
config
.
template
at
<
0
>())
>
;
return
WG
::
kK
/
WG
::
WarpGemmAttribute
::
Impl
::
kABKLane
;
return
min
(
MaxVectorSize
,
WG
::
kK
/
WG
::
WarpGemmAttribute
::
Impl
::
kABKLane
);
}
template
<
typename
Problem
,
typename
BlockGemm
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
MakeQDramTileDistribution
()
{
constexpr
auto
config
=
BlockGemm
::
Policy
::
template
GetWarpGemmMWarpNWarp
<
Problem
>();
using
WG
=
remove_cvref_t
<
decltype
(
config
.
template
at
<
0
>())
>
;
constexpr
index_t
MWarp
=
config
.
template
at
<
1
>();
constexpr
index_t
kMPerBlock
=
Problem
::
BlockFmhaShape
::
kM0
;
constexpr
index_t
kKPerBlock
=
Problem
::
BlockFmhaShape
::
kSubQKHeaddim
;
constexpr
index_t
K2
=
WG
::
kK
/
WG
::
WarpGemmAttribute
::
Impl
::
kABKLane
;
constexpr
index_t
K1
=
WG
::
WarpGemmAttribute
::
Impl
::
kABKLane
;
constexpr
index_t
K0
=
kKPerBlock
/
(
K1
*
K2
);
constexpr
index_t
M2
=
WG
::
WarpGemmAttribute
::
Impl
::
kAMLane
;
constexpr
index_t
M1
=
MWarp
;
constexpr
index_t
M0
=
kMPerBlock
/
(
M2
*
M1
);
if
constexpr
(
1
<
Problem
::
kNumGemm0Warps
)
{
return
make_static_tile_distribution
(
tile_distribution_encoding
<
sequence
<
1
>
,
tuple
<
sequence
<
M0
,
M1
,
M2
>
,
sequence
<
K0
,
K1
,
K2
>>
,
tuple
<
sequence
<
1
>
,
sequence
<
2
,
1
>>
,
tuple
<
sequence
<
1
>
,
sequence
<
1
,
2
>>
,
sequence
<
1
,
2
,
2
>
,
sequence
<
0
,
0
,
2
>>
{});
}
else
{
static_assert
(
MWarp
==
1
);
return
make_static_tile_distribution
(
tile_distribution_encoding
<
sequence
<
1
>
,
tuple
<
sequence
<
M0
,
M1
,
M2
>
,
sequence
<
K0
,
K1
,
K2
>>
,
tuple
<
sequence
<
2
,
1
>>
,
tuple
<
sequence
<
1
,
2
>>
,
sequence
<
1
,
2
,
2
>
,
sequence
<
0
,
0
,
2
>>
{});
}
return
BlockGemm
::
template
MakeABlockTileDistribution
<
Problem
::
BlockFmhaShape
::
kM0
,
Problem
::
BlockFmhaShape
::
kSubQKHeaddim
>();
}
template
<
typename
Problem
>
...
...
@@ -105,7 +74,7 @@ struct BlockFmhaPipelineQXCustomPolicy</* QLoadOnce = */ true>
constexpr
auto
warp_gemm
=
[]()
{
constexpr
index_t
WarpGemmM
=
Problem
::
BlockFmhaShape
::
Gemm0WarpTile
::
at
(
number
<
0
>
{});
static_assert
(
WarpGemmM
==
16
||
WarpGemmM
==
32
);
static_assert
(
WarpGemmM
==
4
||
WarpGemmM
==
16
||
WarpGemmM
==
32
);
if
constexpr
(
std
::
is_same_v
<
typename
Problem
::
QDataType
,
half_t
>
&&
std
::
is_same_v
<
typename
Problem
::
KDataType
,
half_t
>
&&
...
...
@@ -113,8 +82,10 @@ struct BlockFmhaPipelineQXCustomPolicy</* QLoadOnce = */ true>
{
if
constexpr
(
WarpGemmM
==
32
)
return
WarpGemmMfmaF16F16F32M32N32K16SwizzleBTransposedCDistribution
{};
else
//
WarpGemmM == 16
else
if
constexpr
(
WarpGemmM
==
16
)
return
WarpGemmMfmaF16F16F32M16N16K16TransposedCDistribution
{};
else
// WarpGemmM == 4
return
WarpGemmMfmaF16F16F32M4N64K16
{};
}
else
if
constexpr
(
std
::
is_same_v
<
typename
Problem
::
QDataType
,
bf16_t
>
&&
std
::
is_same_v
<
typename
Problem
::
KDataType
,
bf16_t
>
&&
...
...
@@ -122,8 +93,10 @@ struct BlockFmhaPipelineQXCustomPolicy</* QLoadOnce = */ true>
{
if
constexpr
(
WarpGemmM
==
32
)
return
WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleBTransposedCDistribution
{};
else
//
WarpGemmM == 16
else
if
constexpr
(
WarpGemmM
==
16
)
return
WarpGemmMfmaBf16Bf16F32M16N16K16TransposedCDistribution
{};
else
// WarpGemmM == 4
return
WarpGemmMfmaBf16Bf16F32M4N64K16
{};
}
else
if
constexpr
(
std
::
is_same_v
<
typename
Problem
::
QDataType
,
fp8_t
>
&&
std
::
is_same_v
<
typename
Problem
::
KDataType
,
fp8_t
>
&&
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment