Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
f549173b
Commit
f549173b
authored
Jan 01, 2025
by
shengnxu
Browse files
simple gemm2 for gemm1 debugging
parent
811b75d3
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
2002 additions
and
603 deletions
+2002
-603
include/ck_tile/ops/flatmm/block/flatmm_32x512x256_1x4x1_16x16x64_int8.hpp
...ps/flatmm/block/flatmm_32x512x256_1x4x1_16x16x64_int8.hpp
+6
-1
include/ck_tile/ops/flatmm/block/flatmm_sn_32x256x512_1x4x1_16x16x64_int8.hpp
...flatmm/block/flatmm_sn_32x256x512_1x4x1_16x16x64_int8.hpp
+3
-0
include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x256x512_1x4x1_16x16x32_int8.inc
...k/uk/flatmm_sn_uk_gfx9_32x256x512_1x4x1_16x16x32_int8.inc
+1948
-578
include/ck_tile/ops/flatmm/block/uk/flatmm_uk_gfx9_32x512x256_1x1x1_16x16x32_int8.inc
...lock/uk/flatmm_uk_gfx9_32x512x256_1x1x1_16x16x32_int8.inc
+20
-17
include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk_int8.hpp
...ed_moe/pipeline/fused_moegemm_pipeline_flatmm_uk_int8.hpp
+25
-7
No files found.
include/ck_tile/ops/flatmm/block/flatmm_32x512x256_1x4x1_16x16x64_int8.hpp
View file @
f549173b
...
...
@@ -245,12 +245,13 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16
// TODO: needs to be paired with tile_window_linear!
// TODO: init_raw() must be called before calling this function!
template
<
typename
DQRes
,
typename
GQRes
,
typename
ARes
,
typename
ACoords
,
typename
BRes
,
typename
BCoords
>
template
<
typename
DQRes
,
typename
GQRes
,
typename
SMQRes
,
typename
ARes
,
typename
ACoords
,
typename
BRes
,
typename
BCoords
>
CK_TILE_DEVICE
auto
operator
()(
index_t
row_ids_a_
,
const
DQes
&
res_aq
const
DQes
&
res_dq
,
const
GQRes
&
res_gq
,
const
SMQRes
&
res_smq
,
const
Res
&
res_a
,
const
ACoords
&
cached_coords_a
,
const
BRes
&
res_b
,
...
...
@@ -405,6 +406,10 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16
[
s_res_gq1
]
"s"
(
res_gq
[
1
]),
[
s_res_gq2
]
"s"
(
res_gq
[
2
]),
[
s_res_gq3
]
"s"
(
res_gq
[
3
]),
[
s_res_smq0
]
"s"
(
res_smq
[
0
]),
[
s_res_smq1
]
"s"
(
res_smq
[
1
]),
[
s_res_smq2
]
"s"
(
res_smq
[
2
]),
[
s_res_smq3
]
"s"
(
res_smq
[
3
]),
[
s_res_a0
]
"s"
(
res_a
[
0
]),
[
s_res_a1
]
"s"
(
res_a
[
1
]),
[
s_res_a2
]
"s"
(
res_a
[
2
]),
...
...
include/ck_tile/ops/flatmm/block/flatmm_sn_32x256x512_1x4x1_16x16x64_int8.hpp
View file @
f549173b
...
...
@@ -92,6 +92,7 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x
CK_TILE_LDS_ADDR
void
*
smem
,
index_t
n
,
// loop along n dim
const
ScaleTensor
&
scale_
,
index_t
tile_offset_dq
,
index_t
tile_offset_b
,
// stride b is fixed to blockKr * blockW, but can still be adjusted
index_t
tile_offset_half_b
,
// load is split along K into 2 parts
index_t
tile_offset_o
)
...
...
@@ -102,6 +103,7 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x
const
index_t
tile_stride_b_bytes
=
tile_offset_b
*
sizeof
(
BDataType
);
const
index_t
tile_offset_half_b_bytes
=
tile_offset_half_b
*
sizeof
(
BDataType
);
const
index_t
tile_stride_o_bytes
=
tile_offset_o
*
sizeof
(
ODataType
);
const
index_t
tile_stride_dq_bytes
=
tile_offset_dq
*
sizeof
(
DScaleDataType
);
static_assert
(
ScaleTensor
::
size
()
==
2
);
float
s0
=
scale_
[
number
<
0
>
{}];
...
...
@@ -244,6 +246,7 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x
[
s_tile_os_o
]
"s"
(
tile_stride_o_bytes
),
[
s_tile_os_b_half
]
"s"
(
tile_offset_half_b_bytes
),
[
s_tile_os_b
]
"s"
(
tile_stride_b_bytes
),
[
s_tile_os_dq
]
"s"
(
tile_stride_dq_bytes
),
[
scale_0
]
"v"
(
s0
),
[
scale_1
]
"v"
(
s1
),
[
v_nan_lo
]
"v"
(
nan_lo
),
...
...
include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x256x512_1x4x1_16x16x32_int8.inc
View file @
f549173b
#ifndef CK_TILE_FLATMM_UK_MFMA
#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_
BF16
#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_
INT8
#endif
#if CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_
BF16
# define _UK_MFMA_ "v_mfma_
f
32_16x16x
16_bf16
"
#if CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_
INT8
# define _UK_MFMA_ "v_mfma_
i
32_16x16x
32_i8
"
# define _UK_PK_CVT_(x0_, x1_, y_) \
" v_cmp_u_f32 s[36:37], "
x0_
", "
x0_
"
\n
"
\
...
...
@@ -27,587 +27,1957 @@
# define _UK_ATOMIC_ADD_ "global_atomic_pk_add_f16"
#endif
";-------------------------------------------------------------
\n
"
" s_mov_b32 s52, 0x07060302 ; v_perm
\n
"
" s_mov_b64 s[38:39], exec ; save current exec
\n
"
" s_mov_b32 s8, %[s_res_o0]
\n
"
" s_mov_b32 s9, %[s_res_o1]
\n
"
" s_mov_b32 s12, %[s_res_b0]
\n
"
" s_mov_b32 s13, %[s_res_b1]
\n
"
" s_mov_b32 s14, %[s_res_b2]
\n
"
" s_mov_b32 s15, %[s_res_b3]
\n
"
" ds_read_b64 v[128:129], %[v_sld_y_os] offset:0 + %[sld_a_base]
\n
"
" ds_read_b64 v[130:131], %[v_sld_y_os] offset:128 + %[sld_a_base]
\n
"
" ds_read_b64 v[132:133], %[v_sld_y_os] offset:1024 + %[sld_a_base]
\n
"
" ds_read_b64 v[134:135], %[v_sld_y_os] offset:1152 + %[sld_a_base]
\n
"
" ds_read_b64 v[136:137], %[v_sld_y_os] offset:2048 + %[sld_a_base]
\n
"
" ds_read_b64 v[138:139], %[v_sld_y_os] offset:2176 + %[sld_a_base]
\n
"
" ds_read_b64 v[140:141], %[v_sld_y_os] offset:3072 + %[sld_a_base]
\n
"
" ds_read_b64 v[142:143], %[v_sld_y_os] offset:3200 + %[sld_a_base]
\n
"
" ds_read_b64 v[144:145], %[v_sld_y_os] offset:4096 + %[sld_a_base]
\n
"
" ds_read_b64 v[146:147], %[v_sld_y_os] offset:4224 + %[sld_a_base]
\n
"
" ds_read_b64 v[148:149], %[v_sld_y_os] offset:5120 + %[sld_a_base]
\n
"
" ds_read_b64 v[150:151], %[v_sld_y_os] offset:5248 + %[sld_a_base]
\n
"
" ds_read_b64 v[152:153], %[v_sld_y_os] offset:6144 + %[sld_a_base]
\n
"
" ds_read_b64 v[154:155], %[v_sld_y_os] offset:6272 + %[sld_a_base]
\n
"
" ds_read_b64 v[156:157], %[v_sld_y_os] offset:7168 + %[sld_a_base]
\n
"
" ds_read_b64 v[158:159], %[v_sld_y_os] offset:7296 + %[sld_a_base]
\n
"
" ds_read_b64 v[160:161], %[v_sld_y_os] offset:8192 + %[sld_a_base]
\n
"
" ds_read_b64 v[162:163], %[v_sld_y_os] offset:8320 + %[sld_a_base]
\n
"
" ds_read_b64 v[164:165], %[v_sld_y_os] offset:9216 + %[sld_a_base]
\n
"
" ds_read_b64 v[166:167], %[v_sld_y_os] offset:9344 + %[sld_a_base]
\n
"
" ds_read_b64 v[168:169], %[v_sld_y_os] offset:10240 + %[sld_a_base]
\n
"
" ds_read_b64 v[170:171], %[v_sld_y_os] offset:10368 + %[sld_a_base]
\n
"
" ds_read_b64 v[172:173], %[v_sld_y_os] offset:11264 + %[sld_a_base]
\n
"
" ds_read_b64 v[174:175], %[v_sld_y_os] offset:11392 + %[sld_a_base]
\n
"
" ds_read_b64 v[176:177], %[v_sld_y_os] offset:12288 + %[sld_a_base]
\n
"
" ds_read_b64 v[178:179], %[v_sld_y_os] offset:12416 + %[sld_a_base]
\n
"
" ds_read_b64 v[180:181], %[v_sld_y_os] offset:13312 + %[sld_a_base]
\n
"
" ds_read_b64 v[182:183], %[v_sld_y_os] offset:13440 + %[sld_a_base]
\n
"
" ds_read_b64 v[184:185], %[v_sld_y_os] offset:14336 + %[sld_a_base]
\n
"
" ds_read_b64 v[186:187], %[v_sld_y_os] offset:14464 + %[sld_a_base]
\n
"
" ds_read_b64 v[188:189], %[v_sld_y_os] offset:15360 + %[sld_a_base]
\n
"
" ds_read_b64 v[190:191], %[v_sld_y_os] offset:15488 + %[sld_a_base]
\n
"
" ds_read_b64 v[192:193], %[v_sld_y_os] offset:16384 + %[sld_a_base]
\n
"
" ds_read_b64 v[194:195], %[v_sld_y_os] offset:16512 + %[sld_a_base]
\n
"
" ds_read_b64 v[196:197], %[v_sld_y_os] offset:17408 + %[sld_a_base]
\n
"
" ds_read_b64 v[198:199], %[v_sld_y_os] offset:17536 + %[sld_a_base]
\n
"
" ds_read_b64 v[200:201], %[v_sld_y_os] offset:18432 + %[sld_a_base]
\n
"
" ds_read_b64 v[202:203], %[v_sld_y_os] offset:18560 + %[sld_a_base]
\n
"
" ds_read_b64 v[204:205], %[v_sld_y_os] offset:19456 + %[sld_a_base]
\n
"
" ds_read_b64 v[206:207], %[v_sld_y_os] offset:19584 + %[sld_a_base]
\n
"
" ds_read_b64 v[208:209], %[v_sld_y_os] offset:20480 + %[sld_a_base]
\n
"
" ds_read_b64 v[210:211], %[v_sld_y_os] offset:20608 + %[sld_a_base]
\n
"
" ds_read_b64 v[212:213], %[v_sld_y_os] offset:21504 + %[sld_a_base]
\n
"
" ds_read_b64 v[214:215], %[v_sld_y_os] offset:21632 + %[sld_a_base]
\n
"
" ds_read_b64 v[216:217], %[v_sld_y_os] offset:22528 + %[sld_a_base]
\n
"
" ds_read_b64 v[218:219], %[v_sld_y_os] offset:22656 + %[sld_a_base]
\n
"
" ds_read_b64 v[220:221], %[v_sld_y_os] offset:23552 + %[sld_a_base]
\n
"
" ds_read_b64 v[222:223], %[v_sld_y_os] offset:23680 + %[sld_a_base]
\n
"
" ds_read_b64 v[224:225], %[v_sld_y_os] offset:24576 + %[sld_a_base]
\n
"
" ds_read_b64 v[226:227], %[v_sld_y_os] offset:24704 + %[sld_a_base]
\n
"
" ds_read_b64 v[228:229], %[v_sld_y_os] offset:25600 + %[sld_a_base]
\n
"
" ds_read_b64 v[230:231], %[v_sld_y_os] offset:25728 + %[sld_a_base]
\n
"
" ds_read_b64 v[232:233], %[v_sld_y_os] offset:26624 + %[sld_a_base]
\n
"
" ds_read_b64 v[234:235], %[v_sld_y_os] offset:26752 + %[sld_a_base]
\n
"
" ds_read_b64 v[236:237], %[v_sld_y_os] offset:27648 + %[sld_a_base]
\n
"
" ds_read_b64 v[238:239], %[v_sld_y_os] offset:27776 + %[sld_a_base]
\n
"
" ds_read_b64 v[240:241], %[v_sld_y_os] offset:28672 + %[sld_a_base]
\n
"
" ds_read_b64 v[242:243], %[v_sld_y_os] offset:28800 + %[sld_a_base]
\n
"
" ds_read_b64 v[244:245], %[v_sld_y_os] offset:29696 + %[sld_a_base]
\n
"
" ds_read_b64 v[246:247], %[v_sld_y_os] offset:29824 + %[sld_a_base]
\n
"
" ds_read_b64 v[248:249], %[v_sld_y_os] offset:30720 + %[sld_a_base]
\n
"
" ds_read_b64 v[250:251], %[v_sld_y_os] offset:30848 + %[sld_a_base]
\n
"
" ds_read_b64 v[252:253], %[v_sld_y_os] offset:31744 + %[sld_a_base]
\n
"
" ds_read_b64 v[254:255], %[v_sld_y_os] offset:31872 + %[sld_a_base]
\n
"
" s_waitcnt 0
\n
"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[12:15], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[12:15], 0 offen
\n
"
" buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[12:15], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[12:15], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[12:15], 0 offen offset:3072
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond
\n
"
" s_cselect_b32 s86, %[s_tile_os_b], 0
\n
"
" s_add_u32 s12, s86, s12
\n
"
" s_addc_u32 s13, 0, s13
\n
"
" s_waitcnt 0
\n
"
"L_start%=:
\n
"
" s_waitcnt vmcnt(32)
\n
"
" s_barrier
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[0:1], v[128:129], 0
\n
"
" buffer_load_dwordx4 acc[128:131], %[v_os_b0], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[2:3], v[130:131], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[4:5], v[132:133], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[6:7], v[134:135], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[8:9], v[136:137], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" buffer_load_dwordx4 acc[132:135], %[v_os_b0], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[10:11], v[138:139], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[12:13], v[140:141], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[14:15], v[142:143], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[0:1], v[192:193], 0
\n
"
" buffer_load_dwordx4 acc[136:139], %[v_os_b0], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[2:3], v[194:195], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[4:5], v[196:197], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[6:7], v[198:199], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[8:9], v[200:201], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" buffer_load_dwordx4 acc[140:143], %[v_os_b0], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[10:11], v[202:203], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[12:13], v[204:205], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[14:15], v[206:207], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[16:17], v[128:129], 0
\n
"
" buffer_load_dwordx4 acc[144:147], %[v_os_b1], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[18:19], v[130:131], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[20:21], v[132:133], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[22:23], v[134:135], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[24:25], v[136:137], [%[c8], %[c9], %[c10], %[c11]]
\n
"
" buffer_load_dwordx4 acc[148:151], %[v_os_b1], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[26:27], v[138:139], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[28:29], v[140:141], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[30:31], v[142:143], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[16:17], v[192:193], 0
\n
"
" buffer_load_dwordx4 acc[152:155], %[v_os_b1], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[18:19], v[194:195], [%[c12], %[c13], %[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[20:21], v[196:197], [%[c12], %[c13], %[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[22:23], v[198:199], [%[c12], %[c13], %[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[24:25], v[200:201], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" buffer_load_dwordx4 acc[156:159], %[v_os_b1], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[26:27], v[202:203], [%[c12], %[c13], %[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[28:29], v[204:205], [%[c12], %[c13], %[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[30:31], v[206:207], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[32:33], v[144:145], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" buffer_load_dwordx4 acc[160:163], %[v_os_b2], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[34:35], v[146:147], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[36:37], v[148:149], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[38:39], v[150:151], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[40:41], v[152:153], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" buffer_load_dwordx4 acc[164:167], %[v_os_b2], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[42:43], v[154:155], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[44:45], v[156:157], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[46:47], v[158:159], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[32:33], v[208:209], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" buffer_load_dwordx4 acc[168:171], %[v_os_b2], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[34:35], v[210:211], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[36:37], v[212:213], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[38:39], v[214:215], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[40:41], v[216:217], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" buffer_load_dwordx4 acc[172:175], %[v_os_b2], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[42:43], v[218:219], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[44:45], v[220:221], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[46:47], v[222:223], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[48:49], v[144:145], [%[c8], %[c9], %[c10], %[c11]]
\n
"
" buffer_load_dwordx4 acc[176:179], %[v_os_b3], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[50:51], v[146:147], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[52:53], v[148:149], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[54:55], v[150:151], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[56:57], v[152:153], [%[c8], %[c9], %[c10], %[c11]]
\n
"
" buffer_load_dwordx4 acc[180:183], %[v_os_b3], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[58:59], v[154:155], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[60:61], v[156:157], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[62:63], v[158:159], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[48:49], v[208:209], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" buffer_load_dwordx4 acc[184:187], %[v_os_b3], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[50:51], v[210:211], [%[c12], %[c13], %[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[52:53], v[212:213], [%[c12], %[c13], %[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[54:55], v[214:215], [%[c12], %[c13], %[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[56:57], v[216:217], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" buffer_load_dwordx4 acc[188:191], %[v_os_b3], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[58:59], v[218:219], [%[c12], %[c13], %[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[60:61], v[220:221], [%[c12], %[c13], %[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[62:63], v[222:223], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[64:65], v[160:161], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" buffer_load_dwordx4 acc[192:195], %[v_os_b4], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[66:67], v[162:163], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[68:69], v[164:165], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[70:71], v[166:167], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[72:73], v[168:169], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" buffer_load_dwordx4 acc[196:199], %[v_os_b4], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[74:75], v[170:171], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[76:77], v[172:173], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[78:79], v[174:175], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[64:65], v[224:225], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" buffer_load_dwordx4 acc[200:203], %[v_os_b4], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[66:67], v[226:227], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[68:69], v[228:229], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[70:71], v[230:231], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[72:73], v[232:233], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" buffer_load_dwordx4 acc[204:207], %[v_os_b4], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[74:75], v[234:235], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[76:77], v[236:237], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[78:79], v[238:239], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[80:81], v[160:161], [%[c8], %[c9], %[c10], %[c11]]
\n
"
" buffer_load_dwordx4 acc[208:211], %[v_os_b5], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[82:83], v[162:163], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[84:85], v[164:165], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[86:87], v[166:167], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[88:89], v[168:169], [%[c8], %[c9], %[c10], %[c11]]
\n
"
" buffer_load_dwordx4 acc[212:215], %[v_os_b5], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[90:91], v[170:171], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[92:93], v[172:173], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[94:95], v[174:175], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[80:81], v[224:225], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" buffer_load_dwordx4 acc[216:219], %[v_os_b5], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[82:83], v[226:227], [%[c12], %[c13], %[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[84:85], v[228:229], [%[c12], %[c13], %[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[86:87], v[230:231], [%[c12], %[c13], %[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[88:89], v[232:233], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" buffer_load_dwordx4 acc[220:223], %[v_os_b5], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[90:91], v[234:235], [%[c12], %[c13], %[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[92:93], v[236:237], [%[c12], %[c13], %[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[94:95], v[238:239], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[96:97], v[176:177], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" buffer_load_dwordx4 acc[224:227], %[v_os_b6], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[98:99], v[178:179], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[100:101], v[180:181], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[102:103], v[182:183], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[104:105], v[184:185], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" buffer_load_dwordx4 acc[228:231], %[v_os_b6], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[106:107], v[186:187], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[108:109], v[188:189], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[110:111], v[190:191], [%[c0], %[c1], %[c2], %[c3]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[96:97], v[240:241], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" buffer_load_dwordx4 acc[232:235], %[v_os_b6], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[98:99], v[242:243], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[100:101], v[244:245], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[102:103], v[246:247], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[104:105], v[248:249], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" buffer_load_dwordx4 acc[236:239], %[v_os_b6], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[106:107], v[250:251], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[108:109], v[252:253], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[110:111], v[254:255], [%[c4], %[c5], %[c6], %[c7]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[112:113], v[176:177], [%[c8], %[c9], %[c10], %[c11]]
\n
"
" buffer_load_dwordx4 acc[240:243], %[v_os_b7], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[114:115], v[178:179], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[116:117], v[180:181], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[118:119], v[182:183], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[120:121], v[184:185], [%[c8], %[c9], %[c10], %[c11]]
\n
"
" buffer_load_dwordx4 acc[244:247], %[v_os_b7], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[122:123], v[186:187], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[124:125], v[188:189], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[126:127], v[190:191], [%[c8], %[c9], %[c10], %[c11]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[112:113], v[240:241], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" buffer_load_dwordx4 acc[248:251], %[v_os_b7], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[114:115], v[242:243], [%[c12], %[c13], %[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[116:117], v[244:245], [%[c12], %[c13], %[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[118:119], v[246:247], [%[c12], %[c13], %[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[120:121], v[248:249], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" buffer_load_dwordx4 acc[252:255], %[v_os_b7], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[122:123], v[250:251], [%[c12], %[c13], %[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[124:125], v[252:253], [%[c12], %[c13], %[c14], %[c15]]
\n
"
_UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[126:127], v[254:255], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" v_mul_f32 %[c0], %[scale_0], %[c0]
\n
"
" v_mul_f32 %[c1], %[scale_0], %[c1]
\n
"
" v_mul_f32 %[c2], %[scale_0], %[c2]
\n
"
" v_mul_f32 %[c3], %[scale_0], %[c3]
\n
"
" v_mul_f32 %[c4], %[scale_1], %[c4]
\n
"
" v_mul_f32 %[c5], %[scale_1], %[c5]
\n
"
" v_mul_f32 %[c6], %[scale_1], %[c6]
\n
"
" v_mul_f32 %[c7], %[scale_1], %[c7]
\n
"
" v_mul_f32 %[c8], %[scale_0], %[c8]
\n
"
" v_mul_f32 %[c9], %[scale_0], %[c9]
\n
"
" v_mul_f32 %[c10], %[scale_0], %[c10]
\n
"
" v_mul_f32 %[c11], %[scale_0], %[c11]
\n
"
" v_mul_f32 %[c12], %[scale_1], %[c12]
\n
"
" v_mul_f32 %[c13], %[scale_1], %[c13]
\n
"
" v_mul_f32 %[c14], %[scale_1], %[c14]
\n
"
" v_mul_f32 %[c15], %[scale_1], %[c15]
\n
"
_UK_PK_CVT_
(
"%[c0]"
,
"%[c1]"
,
"%[c0]"
)
_UK_PK_CVT_
(
"%[c2]"
,
"%[c3]"
,
"%[c1]"
)
_UK_PK_CVT_
(
"%[c4]"
,
"%[c5]"
,
"%[c2]"
)
_UK_PK_CVT_
(
"%[c6]"
,
"%[c7]"
,
"%[c3]"
)
_UK_PK_CVT_
(
"%[c8]"
,
"%[c9]"
,
"%[c4]"
)
_UK_PK_CVT_
(
"%[c10]"
,
"%[c11]"
,
"%[c5]"
)
_UK_PK_CVT_
(
"%[c12]"
,
"%[c13]"
,
"%[c6]"
)
_UK_PK_CVT_
(
"%[c14]"
,
"%[c15]"
,
"%[c7]"
)
" ;------------------------------
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c0],%[c1]] offset:0 + %[shfl_base]
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c2],%[c3]] offset:4352 + %[shfl_base]
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c4],%[c5]] offset:2176 + %[shfl_base]
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c6],%[c7]] offset:6528 + %[shfl_base]
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_barrier
\n
"
" ds_read_b32 %[c0], %[v_sfl_sld] offset:0 + %[shfl_base]
\n
"
" ds_read_b32 %[c1], %[v_sfl_sld] offset:32 + %[shfl_base]
\n
"
" ds_read_b32 %[c2], %[v_sfl_sld] offset:64 + %[shfl_base]
\n
"
" ds_read_b32 %[c3], %[v_sfl_sld] offset:96 + %[shfl_base]
\n
"
" ds_read_b32 %[c4], %[v_sfl_sld] offset:4352 + %[shfl_base]
\n
"
" ds_read_b32 %[c5], %[v_sfl_sld] offset:4384 + %[shfl_base]
\n
"
" ds_read_b32 %[c6], %[v_sfl_sld] offset:4416 + %[shfl_base]
\n
"
" ds_read_b32 %[c7], %[v_sfl_sld] offset:4448 + %[shfl_base]
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_mov_b64 exec, %[s_execflag_0]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o0], %[c0], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_1]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o1], %[c1], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_2]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o2], %[c2], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_3]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o3], %[c3], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_4]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o4], %[c4], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_5]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o5], %[c5], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_6]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o6], %[c6], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_7]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o7], %[c7], s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
" s_sub_i32 %[s_loop_cnt], %[s_loop_cnt], 1 ; k--
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 0
\n
"
" s_cbranch_scc0 L_end%=
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond
\n
"
" s_cselect_b32 s86, %[s_tile_os_b], 0
\n
"
" s_add_u32 s12, s86, s12
\n
"
" s_addc_u32 s13, 0, s13
\n
"
" s_add_u32 s8, %[s_tile_os_o], s8
\n
"
" s_addc_u32 s9, 0, s9
\n
"
" s_waitcnt vmcnt(32)
\n
"
" s_barrier
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[128:129], v[128:129], 0
\n
"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[130:131], v[130:131], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[132:133], v[132:133], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[134:135], v[134:135], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[136:137], v[136:137], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[138:139], v[138:139], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[140:141], v[140:141], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[142:143], v[142:143], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[128:129], v[192:193], 0
\n
"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[130:131], v[194:195], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[132:133], v[196:197], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[134:135], v[198:199], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[136:137], v[200:201], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[138:139], v[202:203], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[140:141], v[204:205], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[142:143], v[206:207], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[144:145], v[128:129], 0
\n
"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[146:147], v[130:131], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[148:149], v[132:133], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[150:151], v[134:135], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[152:153], v[136:137], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[154:155], v[138:139], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[156:157], v[140:141], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[158:159], v[142:143], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[144:145], v[192:193], 0
\n
"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[146:147], v[194:195], [%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[148:149], v[196:197], [%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[150:151], v[198:199], [%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[152:153], v[200:201], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[154:155], v[202:203], [%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[156:157], v[204:205], [%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[158:159], v[206:207], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[160:161], v[144:145], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[162:163], v[146:147], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[164:165], v[148:149], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[166:167], v[150:151], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[168:169], v[152:153], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[170:171], v[154:155], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[172:173], v[156:157], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[174:175], v[158:159], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[160:161], v[208:209], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[162:163], v[210:211], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[164:165], v[212:213], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[166:167], v[214:215], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[168:169], v[216:217], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[170:171], v[218:219], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[172:173], v[220:221], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[174:175], v[222:223], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[176:177], v[144:145], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[178:179], v[146:147], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[180:181], v[148:149], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[182:183], v[150:151], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[184:185], v[152:153], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[186:187], v[154:155], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[188:189], v[156:157], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[190:191], v[158:159], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[176:177], v[208:209], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[178:179], v[210:211], [%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[180:181], v[212:213], [%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[182:183], v[214:215], [%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[184:185], v[216:217], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[186:187], v[218:219], [%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[188:189], v[220:221], [%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[190:191], v[222:223], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[192:193], v[160:161], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[194:195], v[162:163], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[196:197], v[164:165], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[198:199], v[166:167], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[200:201], v[168:169], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[202:203], v[170:171], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[204:205], v[172:173], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[206:207], v[174:175], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[192:193], v[224:225], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[194:195], v[226:227], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[196:197], v[228:229], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[198:199], v[230:231], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[200:201], v[232:233], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[202:203], v[234:235], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[204:205], v[236:237], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[206:207], v[238:239], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[208:209], v[160:161], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[210:211], v[162:163], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[212:213], v[164:165], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[214:215], v[166:167], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[216:217], v[168:169], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[218:219], v[170:171], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[220:221], v[172:173], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[222:223], v[174:175], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[208:209], v[224:225], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[210:211], v[226:227], [%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[212:213], v[228:229], [%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[214:215], v[230:231], [%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[216:217], v[232:233], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[218:219], v[234:235], [%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[220:221], v[236:237], [%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[222:223], v[238:239], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[224:225], v[176:177], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[226:227], v[178:179], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[228:229], v[180:181], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[230:231], v[182:183], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[232:233], v[184:185], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[234:235], v[186:187], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[236:237], v[188:189], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c16],%[c17],%[c18],%[c19]], acc[238:239], v[190:191], [%[c16],%[c17],%[c18],%[c19]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[224:225], v[240:241], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[226:227], v[242:243], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[228:229], v[244:245], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[230:231], v[246:247], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[232:233], v[248:249], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[234:235], v[250:251], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[236:237], v[252:253], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c20],%[c21],%[c22],%[c23]], acc[238:239], v[254:255], [%[c20],%[c21],%[c22],%[c23]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[240:241], v[176:177], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[12:15], 0 offen
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[242:243], v[178:179], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[244:245], v[180:181], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[246:247], v[182:183], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[248:249], v[184:185], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[12:15], 0 offen offset:1024
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[250:251], v[186:187], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[252:253], v[188:189], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c24],%[c25],%[c26],%[c27]], acc[254:255], v[190:191], [%[c24],%[c25],%[c26],%[c27]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[240:241], v[240:241], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[12:15], 0 offen offset:2048
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[242:243], v[242:243], [%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[244:245], v[244:245], [%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[246:247], v[246:247], [%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[248:249], v[248:249], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[12:15], 0 offen offset:3072
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[250:251], v[250:251], [%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[252:253], v[252:253], [%[c28],%[c29],%[c30],%[c31]]
\n
"
_UK_MFMA_
" [%[c28],%[c29],%[c30],%[c31]], acc[254:255], v[254:255], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" v_mul_f32 %[c16], %[scale_0], %[c16]
\n
"
" v_mul_f32 %[c17], %[scale_0], %[c17]
\n
"
" v_mul_f32 %[c18], %[scale_0], %[c18]
\n
"
" v_mul_f32 %[c19], %[scale_0], %[c19]
\n
"
" v_mul_f32 %[c20], %[scale_1], %[c20]
\n
"
" v_mul_f32 %[c21], %[scale_1], %[c21]
\n
"
" v_mul_f32 %[c22], %[scale_1], %[c22]
\n
"
" v_mul_f32 %[c23], %[scale_1], %[c23]
\n
"
" v_mul_f32 %[c24], %[scale_0], %[c24]
\n
"
" v_mul_f32 %[c25], %[scale_0], %[c25]
\n
"
" v_mul_f32 %[c26], %[scale_0], %[c26]
\n
"
" v_mul_f32 %[c27], %[scale_0], %[c27]
\n
"
" v_mul_f32 %[c28], %[scale_1], %[c28]
\n
"
" v_mul_f32 %[c29], %[scale_1], %[c29]
\n
"
" v_mul_f32 %[c30], %[scale_1], %[c30]
\n
"
" v_mul_f32 %[c31], %[scale_1], %[c31]
\n
"
_UK_PK_CVT_
(
"%[c16]"
,
"%[c17]"
,
"%[c16]"
)
_UK_PK_CVT_
(
"%[c18]"
,
"%[c19]"
,
"%[c17]"
)
_UK_PK_CVT_
(
"%[c20]"
,
"%[c21]"
,
"%[c18]"
)
_UK_PK_CVT_
(
"%[c22]"
,
"%[c23]"
,
"%[c19]"
)
_UK_PK_CVT_
(
"%[c24]"
,
"%[c25]"
,
"%[c20]"
)
_UK_PK_CVT_
(
"%[c26]"
,
"%[c27]"
,
"%[c21]"
)
_UK_PK_CVT_
(
"%[c28]"
,
"%[c29]"
,
"%[c22]"
)
_UK_PK_CVT_
(
"%[c30]"
,
"%[c31]"
,
"%[c23]"
)
" ;------------------------------
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c16],%[c17]] offset:0 + %[shfl_base]
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c18],%[c19]] offset:4352 + %[shfl_base]
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c20],%[c21]] offset:2176 + %[shfl_base]
\n
"
" ds_write_b64 %[v_sfl_sst], [%[c22],%[c23]] offset:6528 + %[shfl_base]
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_barrier
\n
"
" ds_read_b32 %[c16], %[v_sfl_sld] offset:0 + %[shfl_base]
\n
"
" ds_read_b32 %[c17], %[v_sfl_sld] offset:32 + %[shfl_base]
\n
"
" ds_read_b32 %[c18], %[v_sfl_sld] offset:64 + %[shfl_base]
\n
"
" ds_read_b32 %[c19], %[v_sfl_sld] offset:96 + %[shfl_base]
\n
"
" ds_read_b32 %[c20], %[v_sfl_sld] offset:4352 + %[shfl_base]
\n
"
" ds_read_b32 %[c21], %[v_sfl_sld] offset:4384 + %[shfl_base]
\n
"
" ds_read_b32 %[c22], %[v_sfl_sld] offset:4416 + %[shfl_base]
\n
"
" ds_read_b32 %[c23], %[v_sfl_sld] offset:4448 + %[shfl_base]
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_mov_b64 exec, %[s_execflag_0]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o0], %[c16], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_1]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o1], %[c17], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_2]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o2], %[c18], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_3]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o3], %[c19], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_4]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o4], %[c20], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_5]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o5], %[c21], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_6]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o6], %[c22], s[8:9]
\n
"
" s_mov_b64 exec, %[s_execflag_7]
\n
"
_UK_ATOMIC_ADD_
" %[v_os_o7], %[c23], s[8:9]
\n
"
" s_mov_b64 exec, s[38:39]
\n
"
" s_sub_i32 %[s_loop_cnt], %[s_loop_cnt], 1 ; k--
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 0
\n
"
" s_cbranch_scc0 L_end%=
\n
"
" s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond
\n
"
" s_cselect_b32 s86, %[s_tile_os_b], 0
\n
"
" s_add_u32 s12, s86, s12
\n
"
" s_addc_u32 s13, 0, s13
\n
"
" s_add_u32 s8, %[s_tile_os_o], s8
\n
"
" s_addc_u32 s9, 0, s9
\n
"
" s_branch L_start%=
\n
"
"L_end%=:
\n
"
" s_waitcnt vmcnt(24)
\n
"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen
\n
"
" v_mul_f32 v54, v128, v128
\n
"
" v_mul_f32 v55, v129, v129
\n
"
" v_mul_f32 v56, v130, v130
\n
"
" v_mul_f32 v57, v131, v131
\n
"
" v_fma_f32 v54, v54, s77, v1
\n
"
" v_fma_f32 v55, v55, s77, v1
\n
"
" v_fma_f32 v56, v56, s77, v1
\n
"
" v_fma_f32 v57, v57, s77, v1
\n
"
" v_mul_f32 v54, v54, v128
\n
"
" v_mul_f32 v55, v55, v129
\n
"
" v_mul_f32 v56, v56, v130
\n
"
" v_mul_f32 v57, v57, v131
\n
"
" v_mul_f32 v54, v54, s6
\n
"
" v_mul_f32 v55, v55, s6
\n
"
" v_mul_f32 v56, v56, s6
\n
"
" v_mul_f32 v57, v57, s6
\n
"
" v_exp_f32 v54, v54
\n
"
" v_exp_f32 v55, v55
\n
"
" v_exp_f32 v56, v56
\n
"
" v_exp_f32 v57, v57
\n
"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024
\n
"
" v_add_f32 v54, v54, 1.0
\n
"
" v_add_f32 v55, v55, 1.0
\n
"
" v_add_f32 v56, v56, 1.0
\n
"
" v_add_f32 v57, v57, 1.0
\n
"
" v_rcp_f32 v54, v54
\n
"
" v_rcp_f32 v55, v55
\n
"
" v_rcp_f32 v56, v56
\n
"
" v_rcp_f32 v57, v57
\n
"
" v_mul_f32 v128, v128, v54
\n
"
" v_mul_f32 v129, v129, v55
\n
"
" v_mul_f32 v130, v130, v56
\n
"
" v_mul_f32 v131, v131, v57
\n
"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048
\n
"
" v_mul_f32 v54, v132, v132
\n
"
" v_mul_f32 v55, v133, v133
\n
"
" v_mul_f32 v56, v134, v134
\n
"
" v_mul_f32 v57, v135, v135
\n
"
" v_fma_f32 v54, v54, s77, v1
\n
"
" v_fma_f32 v55, v55, s77, v1
\n
"
" v_fma_f32 v56, v56, s77, v1
\n
"
" v_fma_f32 v57, v57, s77, v1
\n
"
" v_mul_f32 v54, v54, v132
\n
"
" v_mul_f32 v55, v55, v133
\n
"
" v_mul_f32 v56, v56, v134
\n
"
" v_mul_f32 v57, v57, v135
\n
"
" v_mul_f32 v54, v54, s6
\n
"
" v_mul_f32 v55, v55, s6
\n
"
" v_mul_f32 v56, v56, s6
\n
"
" v_mul_f32 v57, v57, s6
\n
"
" v_exp_f32 v54, v54
\n
"
" v_exp_f32 v55, v55
\n
"
" v_exp_f32 v56, v56
\n
"
" v_exp_f32 v57, v57
\n
"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072
\n
"
" v_add_f32 v54, v54, 1.0
\n
"
" v_add_f32 v55, v55, 1.0
\n
"
" v_add_f32 v56, v56, 1.0
\n
"
" v_add_f32 v57, v57, 1.0
\n
"
" v_rcp_f32 v54, v54
\n
"
" v_rcp_f32 v55, v55
\n
"
" v_rcp_f32 v56, v56
\n
"
" v_rcp_f32 v57, v57
\n
"
" v_mul_f32 v132, v132, v54
\n
"
" v_mul_f32 v133, v133, v55
\n
"
" v_mul_f32 v134, v134, v56
\n
"
" v_mul_f32 v135, v135, v57
\n
"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen
\n
"
" v_mul_f32 v54, v136, v136
\n
"
" v_mul_f32 v55, v137, v137
\n
"
" v_mul_f32 v56, v138, v138
\n
"
" v_mul_f32 v57, v139, v139
\n
"
" v_fma_f32 v54, v54, s77, v1
\n
"
" v_fma_f32 v55, v55, s77, v1
\n
"
" v_fma_f32 v56, v56, s77, v1
\n
"
" v_fma_f32 v57, v57, s77, v1
\n
"
" v_mul_f32 v54, v54, v136
\n
"
" v_mul_f32 v55, v55, v137
\n
"
" v_mul_f32 v56, v56, v138
\n
"
" v_mul_f32 v57, v57, v139
\n
"
" v_mul_f32 v54, v54, s6
\n
"
" v_mul_f32 v55, v55, s6
\n
"
" v_mul_f32 v56, v56, s6
\n
"
" v_mul_f32 v57, v57, s6
\n
"
" v_exp_f32 v54, v54
\n
"
" v_exp_f32 v55, v55
\n
"
" v_exp_f32 v56, v56
\n
"
" v_exp_f32 v57, v57
\n
"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024
\n
"
" v_add_f32 v54, v54, 1.0
\n
"
" v_add_f32 v55, v55, 1.0
\n
"
" v_add_f32 v56, v56, 1.0
\n
"
" v_add_f32 v57, v57, 1.0
\n
"
" v_rcp_f32 v54, v54
\n
"
" v_rcp_f32 v55, v55
\n
"
" v_rcp_f32 v56, v56
\n
"
" v_rcp_f32 v57, v57
\n
"
" v_mul_f32 v136, v136, v54
\n
"
" v_mul_f32 v137, v137, v55
\n
"
" v_mul_f32 v138, v138, v56
\n
"
" v_mul_f32 v139, v139, v57
\n
"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048
\n
"
" v_mul_f32 v54, v140, v140
\n
"
" v_mul_f32 v55, v141, v141
\n
"
" v_mul_f32 v56, v142, v142
\n
"
" v_mul_f32 v57, v143, v143
\n
"
" v_fma_f32 v54, v54, s77, v1
\n
"
" v_fma_f32 v55, v55, s77, v1
\n
"
" v_fma_f32 v56, v56, s77, v1
\n
"
" v_fma_f32 v57, v57, s77, v1
\n
"
" v_mul_f32 v54, v54, v140
\n
"
" v_mul_f32 v55, v55, v141
\n
"
" v_mul_f32 v56, v56, v142
\n
"
" v_mul_f32 v57, v57, v143
\n
"
" v_mul_f32 v54, v54, s6
\n
"
" v_mul_f32 v55, v55, s6
\n
"
" v_mul_f32 v56, v56, s6
\n
"
" v_mul_f32 v57, v57, s6
\n
"
" v_exp_f32 v54, v54
\n
"
" v_exp_f32 v55, v55
\n
"
" v_exp_f32 v56, v56
\n
"
" v_exp_f32 v57, v57
\n
"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072
\n
"
" v_add_f32 v54, v54, 1.0
\n
"
" v_add_f32 v55, v55, 1.0
\n
"
" v_add_f32 v56, v56, 1.0
\n
"
" v_add_f32 v57, v57, 1.0
\n
"
" v_rcp_f32 v54, v54
\n
"
" v_rcp_f32 v55, v55
\n
"
" v_rcp_f32 v56, v56
\n
"
" v_rcp_f32 v57, v57
\n
"
" v_mul_f32 v140, v140, v54
\n
"
" v_mul_f32 v141, v141, v55
\n
"
" v_mul_f32 v142, v142, v56
\n
"
" v_mul_f32 v143, v143, v57
\n
"
" s_waitcnt vmcnt(24)
\n
"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen
\n
"
" v_mul_f32 v54, v144, v144
\n
"
" v_mul_f32 v55, v145, v145
\n
"
" v_mul_f32 v56, v146, v146
\n
"
" v_mul_f32 v57, v147, v147
\n
"
" v_fma_f32 v54, v54, s77, v1
\n
"
" v_fma_f32 v55, v55, s77, v1
\n
"
" v_fma_f32 v56, v56, s77, v1
\n
"
" v_fma_f32 v57, v57, s77, v1
\n
"
" v_mul_f32 v54, v54, v144
\n
"
" v_mul_f32 v55, v55, v145
\n
"
" v_mul_f32 v56, v56, v146
\n
"
" v_mul_f32 v57, v57, v147
\n
"
" v_mul_f32 v54, v54, s6
\n
"
" v_mul_f32 v55, v55, s6
\n
"
" v_mul_f32 v56, v56, s6
\n
"
" v_mul_f32 v57, v57, s6
\n
"
" v_exp_f32 v54, v54
\n
"
" v_exp_f32 v55, v55
\n
"
" v_exp_f32 v56, v56
\n
"
" v_exp_f32 v57, v57
\n
"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024
\n
"
" v_add_f32 v54, v54, 1.0
\n
"
" v_add_f32 v55, v55, 1.0
\n
"
" v_add_f32 v56, v56, 1.0
\n
"
" v_add_f32 v57, v57, 1.0
\n
"
" v_rcp_f32 v54, v54
\n
"
" v_rcp_f32 v55, v55
\n
"
" v_rcp_f32 v56, v56
\n
"
" v_rcp_f32 v57, v57
\n
"
" v_mul_f32 v144, v144, v54
\n
"
" v_mul_f32 v145, v145, v55
\n
"
" v_mul_f32 v146, v146, v56
\n
"
" v_mul_f32 v147, v147, v57
\n
"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048
\n
"
" v_mul_f32 v54, v148, v148
\n
"
" v_mul_f32 v55, v149, v149
\n
"
" v_mul_f32 v56, v150, v150
\n
"
" v_mul_f32 v57, v151, v151
\n
"
" v_fma_f32 v54, v54, s77, v1
\n
"
" v_fma_f32 v55, v55, s77, v1
\n
"
" v_fma_f32 v56, v56, s77, v1
\n
"
" v_fma_f32 v57, v57, s77, v1
\n
"
" v_mul_f32 v54, v54, v148
\n
"
" v_mul_f32 v55, v55, v149
\n
"
" v_mul_f32 v56, v56, v150
\n
"
" v_mul_f32 v57, v57, v151
\n
"
" v_mul_f32 v54, v54, s6
\n
"
" v_mul_f32 v55, v55, s6
\n
"
" v_mul_f32 v56, v56, s6
\n
"
" v_mul_f32 v57, v57, s6
\n
"
" v_exp_f32 v54, v54
\n
"
" v_exp_f32 v55, v55
\n
"
" v_exp_f32 v56, v56
\n
"
" v_exp_f32 v57, v57
\n
"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072
\n
"
" v_add_f32 v54, v54, 1.0
\n
"
" v_add_f32 v55, v55, 1.0
\n
"
" v_add_f32 v56, v56, 1.0
\n
"
" v_add_f32 v57, v57, 1.0
\n
"
" v_rcp_f32 v54, v54
\n
"
" v_rcp_f32 v55, v55
\n
"
" v_rcp_f32 v56, v56
\n
"
" v_rcp_f32 v57, v57
\n
"
" v_mul_f32 v148, v148, v54
\n
"
" v_mul_f32 v149, v149, v55
\n
"
" v_mul_f32 v150, v150, v56
\n
"
" v_mul_f32 v151, v151, v57
\n
"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen
\n
"
" v_mul_f32 v54, v152, v152
\n
"
" v_mul_f32 v55, v153, v153
\n
"
" v_mul_f32 v56, v154, v154
\n
"
" v_mul_f32 v57, v155, v155
\n
"
" v_fma_f32 v54, v54, s77, v1
\n
"
" v_fma_f32 v55, v55, s77, v1
\n
"
" v_fma_f32 v56, v56, s77, v1
\n
"
" v_fma_f32 v57, v57, s77, v1
\n
"
" v_mul_f32 v54, v54, v152
\n
"
" v_mul_f32 v55, v55, v153
\n
"
" v_mul_f32 v56, v56, v154
\n
"
" v_mul_f32 v57, v57, v155
\n
"
" v_mul_f32 v54, v54, s6
\n
"
" v_mul_f32 v55, v55, s6
\n
"
" v_mul_f32 v56, v56, s6
\n
"
" v_mul_f32 v57, v57, s6
\n
"
" v_exp_f32 v54, v54
\n
"
" v_exp_f32 v55, v55
\n
"
" v_exp_f32 v56, v56
\n
"
" v_exp_f32 v57, v57
\n
"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024
\n
"
" v_add_f32 v54, v54, 1.0
\n
"
" v_add_f32 v55, v55, 1.0
\n
"
" v_add_f32 v56, v56, 1.0
\n
"
" v_add_f32 v57, v57, 1.0
\n
"
" v_rcp_f32 v54, v54
\n
"
" v_rcp_f32 v55, v55
\n
"
" v_rcp_f32 v56, v56
\n
"
" v_rcp_f32 v57, v57
\n
"
" v_mul_f32 v152, v152, v54
\n
"
" v_mul_f32 v153, v153, v55
\n
"
" v_mul_f32 v154, v154, v56
\n
"
" v_mul_f32 v155, v155, v57
\n
"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048
\n
"
" v_mul_f32 v54, v156, v156
\n
"
" v_mul_f32 v55, v157, v157
\n
"
" v_mul_f32 v56, v158, v158
\n
"
" v_mul_f32 v57, v159, v159
\n
"
" v_fma_f32 v54, v54, s77, v1
\n
"
" v_fma_f32 v55, v55, s77, v1
\n
"
" v_fma_f32 v56, v56, s77, v1
\n
"
" v_fma_f32 v57, v57, s77, v1
\n
"
" v_mul_f32 v54, v54, v156
\n
"
" v_mul_f32 v55, v55, v157
\n
"
" v_mul_f32 v56, v56, v158
\n
"
" v_mul_f32 v57, v57, v159
\n
"
" v_mul_f32 v54, v54, s6
\n
"
" v_mul_f32 v55, v55, s6
\n
"
" v_mul_f32 v56, v56, s6
\n
"
" v_mul_f32 v57, v57, s6
\n
"
" v_exp_f32 v54, v54
\n
"
" v_exp_f32 v55, v55
\n
"
" v_exp_f32 v56, v56
\n
"
" v_exp_f32 v57, v57
\n
"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072
\n
"
" s_add_u32 s12, %[s_tile_os_b_half], s12
\n
"
" s_addc_u32 s13, 0, s13
\n
"
" v_add_f32 v54, v54, 1.0
\n
"
" v_add_f32 v55, v55, 1.0
\n
"
" v_add_f32 v56, v56, 1.0
\n
"
" v_add_f32 v57, v57, 1.0
\n
"
" v_rcp_f32 v54, v54
\n
"
" v_rcp_f32 v55, v55
\n
"
" v_rcp_f32 v56, v56
\n
"
" v_rcp_f32 v57, v57
\n
"
" v_mul_f32 v156, v156, v54
\n
"
" v_mul_f32 v157, v157, v55
\n
"
" v_mul_f32 v158, v158, v56
\n
"
" v_mul_f32 v159, v159, v57
\n
"
" s_waitcnt vmcnt(24)
\n
"
" buffer_load_dwordx4 acc[64:67], %[v_os_b0], s[12:15], 0 offen
\n
"
" v_mul_f32 v54, v160, v160
\n
"
" v_mul_f32 v55, v161, v161
\n
"
" v_mul_f32 v56, v162, v162
\n
"
" v_mul_f32 v57, v163, v163
\n
"
" v_fma_f32 v54, v54, s77, v1
\n
"
" v_fma_f32 v55, v55, s77, v1
\n
"
" v_fma_f32 v56, v56, s77, v1
\n
"
" v_fma_f32 v57, v57, s77, v1
\n
"
" v_mul_f32 v54, v54, v160
\n
"
" v_mul_f32 v55, v55, v161
\n
"
" v_mul_f32 v56, v56, v162
\n
"
" v_mul_f32 v57, v57, v163
\n
"
" v_mul_f32 v54, v54, s6
\n
"
" v_mul_f32 v55, v55, s6
\n
"
" v_mul_f32 v56, v56, s6
\n
"
" v_mul_f32 v57, v57, s6
\n
"
" v_exp_f32 v54, v54
\n
"
" v_exp_f32 v55, v55
\n
"
" v_exp_f32 v56, v56
\n
"
" v_exp_f32 v57, v57
\n
"
" buffer_load_dwordx4 acc[68:71], %[v_os_b0], s[12:15], 0 offen offset:1024
\n
"
" v_add_f32 v54, v54, 1.0
\n
"
" v_add_f32 v55, v55, 1.0
\n
"
" v_add_f32 v56, v56, 1.0
\n
"
" v_add_f32 v57, v57, 1.0
\n
"
" v_rcp_f32 v54, v54
\n
"
" v_rcp_f32 v55, v55
\n
"
" v_rcp_f32 v56, v56
\n
"
" v_rcp_f32 v57, v57
\n
"
" v_mul_f32 v160, v160, v54
\n
"
" v_mul_f32 v161, v161, v55
\n
"
" v_mul_f32 v162, v162, v56
\n
"
" v_mul_f32 v163, v163, v57
\n
"
" buffer_load_dwordx4 acc[72:75], %[v_os_b0], s[12:15], 0 offen offset:2048
\n
"
" v_mul_f32 v54, v164, v164
\n
"
" v_mul_f32 v55, v165, v165
\n
"
" v_mul_f32 v56, v166, v166
\n
"
" v_mul_f32 v57, v167, v167
\n
"
" v_fma_f32 v54, v54, s77, v1
\n
"
" v_fma_f32 v55, v55, s77, v1
\n
"
" v_fma_f32 v56, v56, s77, v1
\n
"
" v_fma_f32 v57, v57, s77, v1
\n
"
" v_mul_f32 v54, v54, v164
\n
"
" v_mul_f32 v55, v55, v165
\n
"
" v_mul_f32 v56, v56, v166
\n
"
" v_mul_f32 v57, v57, v167
\n
"
" v_mul_f32 v54, v54, s6
\n
"
" v_mul_f32 v55, v55, s6
\n
"
" v_mul_f32 v56, v56, s6
\n
"
" v_mul_f32 v57, v57, s6
\n
"
" v_exp_f32 v54, v54
\n
"
" v_exp_f32 v55, v55
\n
"
" v_exp_f32 v56, v56
\n
"
" v_exp_f32 v57, v57
\n
"
" buffer_load_dwordx4 acc[76:79], %[v_os_b0], s[12:15], 0 offen offset:3072
\n
"
" v_add_f32 v54, v54, 1.0
\n
"
" v_add_f32 v55, v55, 1.0
\n
"
" v_add_f32 v56, v56, 1.0
\n
"
" v_add_f32 v57, v57, 1.0
\n
"
" v_rcp_f32 v54, v54
\n
"
" v_rcp_f32 v55, v55
\n
"
" v_rcp_f32 v56, v56
\n
"
" v_rcp_f32 v57, v57
\n
"
" v_mul_f32 v164, v164, v54
\n
"
" v_mul_f32 v165, v165, v55
\n
"
" v_mul_f32 v166, v166, v56
\n
"
" v_mul_f32 v167, v167, v57
\n
"
" buffer_load_dwordx4 acc[80:83], %[v_os_b1], s[12:15], 0 offen
\n
"
" v_mul_f32 v54, v168, v168
\n
"
" v_mul_f32 v55, v169, v169
\n
"
" v_mul_f32 v56, v170, v170
\n
"
" v_mul_f32 v57, v171, v171
\n
"
" v_fma_f32 v54, v54, s77, v1
\n
"
" v_fma_f32 v55, v55, s77, v1
\n
"
" v_fma_f32 v56, v56, s77, v1
\n
"
" v_fma_f32 v57, v57, s77, v1
\n
"
" v_mul_f32 v54, v54, v168
\n
"
" v_mul_f32 v55, v55, v169
\n
"
" v_mul_f32 v56, v56, v170
\n
"
" v_mul_f32 v57, v57, v171
\n
"
" v_mul_f32 v54, v54, s6
\n
"
" v_mul_f32 v55, v55, s6
\n
"
" v_mul_f32 v56, v56, s6
\n
"
" v_mul_f32 v57, v57, s6
\n
"
" v_exp_f32 v54, v54
\n
"
" v_exp_f32 v55, v55
\n
"
" v_exp_f32 v56, v56
\n
"
" v_exp_f32 v57, v57
\n
"
" buffer_load_dwordx4 acc[84:87], %[v_os_b1], s[12:15], 0 offen offset:1024
\n
"
" v_add_f32 v54, v54, 1.0
\n
"
" v_add_f32 v55, v55, 1.0
\n
"
" v_add_f32 v56, v56, 1.0
\n
"
" v_add_f32 v57, v57, 1.0
\n
"
" v_rcp_f32 v54, v54
\n
"
" v_rcp_f32 v55, v55
\n
"
" v_rcp_f32 v56, v56
\n
"
" v_rcp_f32 v57, v57
\n
"
" v_mul_f32 v168, v168, v54
\n
"
" v_mul_f32 v169, v169, v55
\n
"
" v_mul_f32 v170, v170, v56
\n
"
" v_mul_f32 v171, v171, v57
\n
"
" buffer_load_dwordx4 acc[88:91], %[v_os_b1], s[12:15], 0 offen offset:2048
\n
"
" v_mul_f32 v54, v172, v172
\n
"
" v_mul_f32 v55, v173, v173
\n
"
" v_mul_f32 v56, v174, v174
\n
"
" v_mul_f32 v57, v175, v175
\n
"
" v_fma_f32 v54, v54, s77, v1
\n
"
" v_fma_f32 v55, v55, s77, v1
\n
"
" v_fma_f32 v56, v56, s77, v1
\n
"
" v_fma_f32 v57, v57, s77, v1
\n
"
" v_mul_f32 v54, v54, v172
\n
"
" v_mul_f32 v55, v55, v173
\n
"
" v_mul_f32 v56, v56, v174
\n
"
" v_mul_f32 v57, v57, v175
\n
"
" v_mul_f32 v54, v54, s6
\n
"
" v_mul_f32 v55, v55, s6
\n
"
" v_mul_f32 v56, v56, s6
\n
"
" v_mul_f32 v57, v57, s6
\n
"
" v_exp_f32 v54, v54
\n
"
" v_exp_f32 v55, v55
\n
"
" v_exp_f32 v56, v56
\n
"
" v_exp_f32 v57, v57
\n
"
" buffer_load_dwordx4 acc[92:95], %[v_os_b1], s[12:15], 0 offen offset:3072
\n
"
" v_add_f32 v54, v54, 1.0
\n
"
" v_add_f32 v55, v55, 1.0
\n
"
" v_add_f32 v56, v56, 1.0
\n
"
" v_add_f32 v57, v57, 1.0
\n
"
" v_rcp_f32 v54, v54
\n
"
" v_rcp_f32 v55, v55
\n
"
" v_rcp_f32 v56, v56
\n
"
" v_rcp_f32 v57, v57
\n
"
" v_mul_f32 v172, v172, v54
\n
"
" v_mul_f32 v173, v173, v55
\n
"
" v_mul_f32 v174, v174, v56
\n
"
" v_mul_f32 v175, v175, v57
\n
"
" s_waitcnt vmcnt(24)
\n
"
" buffer_load_dwordx4 acc[96:99], %[v_os_b2], s[12:15], 0 offen
\n
"
" v_mul_f32 v54, v176, v176
\n
"
" v_mul_f32 v55, v177, v177
\n
"
" v_mul_f32 v56, v178, v178
\n
"
" v_mul_f32 v57, v179, v179
\n
"
" v_fma_f32 v54, v54, s77, v1
\n
"
" v_fma_f32 v55, v55, s77, v1
\n
"
" v_fma_f32 v56, v56, s77, v1
\n
"
" v_fma_f32 v57, v57, s77, v1
\n
"
" v_mul_f32 v54, v54, v176
\n
"
" v_mul_f32 v55, v55, v177
\n
"
" v_mul_f32 v56, v56, v178
\n
"
" v_mul_f32 v57, v57, v179
\n
"
" v_mul_f32 v54, v54, s6
\n
"
" v_mul_f32 v55, v55, s6
\n
"
" v_mul_f32 v56, v56, s6
\n
"
" v_mul_f32 v57, v57, s6
\n
"
" v_exp_f32 v54, v54
\n
"
" v_exp_f32 v55, v55
\n
"
" v_exp_f32 v56, v56
\n
"
" v_exp_f32 v57, v57
\n
"
" buffer_load_dwordx4 acc[100:103], %[v_os_b2], s[12:15], 0 offen offset:1024
\n
"
" v_add_f32 v54, v54, 1.0
\n
"
" v_add_f32 v55, v55, 1.0
\n
"
" v_add_f32 v56, v56, 1.0
\n
"
" v_add_f32 v57, v57, 1.0
\n
"
" v_rcp_f32 v54, v54
\n
"
" v_rcp_f32 v55, v55
\n
"
" v_rcp_f32 v56, v56
\n
"
" v_rcp_f32 v57, v57
\n
"
" v_mul_f32 v176, v176, v54
\n
"
" v_mul_f32 v177, v177, v55
\n
"
" v_mul_f32 v178, v178, v56
\n
"
" v_mul_f32 v179, v179, v57
\n
"
" buffer_load_dwordx4 acc[104:107], %[v_os_b2], s[12:15], 0 offen offset:2048
\n
"
" v_mul_f32 v54, v180, v180
\n
"
" v_mul_f32 v55, v181, v181
\n
"
" v_mul_f32 v56, v182, v182
\n
"
" v_mul_f32 v57, v183, v183
\n
"
" v_fma_f32 v54, v54, s77, v1
\n
"
" v_fma_f32 v55, v55, s77, v1
\n
"
" v_fma_f32 v56, v56, s77, v1
\n
"
" v_fma_f32 v57, v57, s77, v1
\n
"
" v_mul_f32 v54, v54, v180
\n
"
" v_mul_f32 v55, v55, v181
\n
"
" v_mul_f32 v56, v56, v182
\n
"
" v_mul_f32 v57, v57, v183
\n
"
" v_mul_f32 v54, v54, s6
\n
"
" v_mul_f32 v55, v55, s6
\n
"
" v_mul_f32 v56, v56, s6
\n
"
" v_mul_f32 v57, v57, s6
\n
"
" v_exp_f32 v54, v54
\n
"
" v_exp_f32 v55, v55
\n
"
" v_exp_f32 v56, v56
\n
"
" v_exp_f32 v57, v57
\n
"
" buffer_load_dwordx4 acc[108:111], %[v_os_b2], s[12:15], 0 offen offset:3072
\n
"
" v_add_f32 v54, v54, 1.0
\n
"
" v_add_f32 v55, v55, 1.0
\n
"
" v_add_f32 v56, v56, 1.0
\n
"
" v_add_f32 v57, v57, 1.0
\n
"
" v_rcp_f32 v54, v54
\n
"
" v_rcp_f32 v55, v55
\n
"
" v_rcp_f32 v56, v56
\n
"
" v_rcp_f32 v57, v57
\n
"
" v_mul_f32 v180, v180, v54
\n
"
" v_mul_f32 v181, v181, v55
\n
"
" v_mul_f32 v182, v182, v56
\n
"
" v_mul_f32 v183, v183, v57
\n
"
" buffer_load_dwordx4 acc[112:115], %[v_os_b3], s[12:15], 0 offen
\n
"
" v_mul_f32 v54, v184, v184
\n
"
" v_mul_f32 v55, v185, v185
\n
"
" v_mul_f32 v56, v186, v186
\n
"
" v_mul_f32 v57, v187, v187
\n
"
" v_fma_f32 v54, v54, s77, v1
\n
"
" v_fma_f32 v55, v55, s77, v1
\n
"
" v_fma_f32 v56, v56, s77, v1
\n
"
" v_fma_f32 v57, v57, s77, v1
\n
"
" v_mul_f32 v54, v54, v184
\n
"
" v_mul_f32 v55, v55, v185
\n
"
" v_mul_f32 v56, v56, v186
\n
"
" v_mul_f32 v57, v57, v187
\n
"
" v_mul_f32 v54, v54, s6
\n
"
" v_mul_f32 v55, v55, s6
\n
"
" v_mul_f32 v56, v56, s6
\n
"
" v_mul_f32 v57, v57, s6
\n
"
" v_exp_f32 v54, v54
\n
"
" v_exp_f32 v55, v55
\n
"
" v_exp_f32 v56, v56
\n
"
" v_exp_f32 v57, v57
\n
"
" buffer_load_dwordx4 acc[116:119], %[v_os_b3], s[12:15], 0 offen offset:1024
\n
"
" v_add_f32 v54, v54, 1.0
\n
"
" v_add_f32 v55, v55, 1.0
\n
"
" v_add_f32 v56, v56, 1.0
\n
"
" v_add_f32 v57, v57, 1.0
\n
"
" v_rcp_f32 v54, v54
\n
"
" v_rcp_f32 v55, v55
\n
"
" v_rcp_f32 v56, v56
\n
"
" v_rcp_f32 v57, v57
\n
"
" v_mul_f32 v184, v184, v54
\n
"
" v_mul_f32 v185, v185, v55
\n
"
" v_mul_f32 v186, v186, v56
\n
"
" v_mul_f32 v187, v187, v57
\n
"
" buffer_load_dwordx4 acc[120:123], %[v_os_b3], s[12:15], 0 offen offset:2048
\n
"
" v_mul_f32 v54, v188, v188
\n
"
" v_mul_f32 v55, v189, v189
\n
"
" v_mul_f32 v56, v190, v190
\n
"
" v_mul_f32 v57, v191, v191
\n
"
" v_fma_f32 v54, v54, s77, v1
\n
"
" v_fma_f32 v55, v55, s77, v1
\n
"
" v_fma_f32 v56, v56, s77, v1
\n
"
" v_fma_f32 v57, v57, s77, v1
\n
"
" v_mul_f32 v54, v54, v188
\n
"
" v_mul_f32 v55, v55, v189
\n
"
" v_mul_f32 v56, v56, v190
\n
"
" v_mul_f32 v57, v57, v191
\n
"
" v_mul_f32 v54, v54, s6
\n
"
" v_mul_f32 v55, v55, s6
\n
"
" v_mul_f32 v56, v56, s6
\n
"
" v_mul_f32 v57, v57, s6
\n
"
" v_exp_f32 v54, v54
\n
"
" v_exp_f32 v55, v55
\n
"
" v_exp_f32 v56, v56
\n
"
" v_exp_f32 v57, v57
\n
"
" buffer_load_dwordx4 acc[124:127], %[v_os_b3], s[12:15], 0 offen offset:3072
\n
"
" v_add_f32 v54, v54, 1.0
\n
"
" v_add_f32 v55, v55, 1.0
\n
"
" v_add_f32 v56, v56, 1.0
\n
"
" v_add_f32 v57, v57, 1.0
\n
"
" v_rcp_f32 v54, v54
\n
"
" v_rcp_f32 v55, v55
\n
"
" v_rcp_f32 v56, v56
\n
"
" v_rcp_f32 v57, v57
\n
"
" v_mul_f32 v188, v188, v54
\n
"
" v_mul_f32 v189, v189, v55
\n
"
" v_mul_f32 v190, v190, v56
\n
"
" v_mul_f32 v191, v191, v57
\n
"
" v_mul_f32 v128, v18, v128 row_newbcast:0
\n
"
" v_mul_f32 v129, v18, v129 row_newbcast:1
\n
"
" v_mul_f32 v130, v18, v130 row_newbcast:2
\n
"
" v_mul_f32 v131, v18, v131 row_newbcast:3
\n
"
" v_mul_f32 v132, v18, v132 row_newbcast:0
\n
"
" v_mul_f32 v133, v18, v133 row_newbcast:1
\n
"
" v_mul_f32 v134, v18, v134 row_newbcast:2
\n
"
" v_mul_f32 v135, v18, v135 row_newbcast:3
\n
"
" v_mul_f32 v136, v18, v136 row_newbcast:4
\n
"
" v_mul_f32 v137, v18, v137 row_newbcast:5
\n
"
" v_mul_f32 v138, v18, v138 row_newbcast:6
\n
"
" v_mul_f32 v139, v18, v139 row_newbcast:7
\n
"
" v_mul_f32 v140, v18, v140 row_newbcast:4
\n
"
" v_mul_f32 v141, v18, v141 row_newbcast:5
\n
"
" v_mul_f32 v142, v18, v142 row_newbcast:6
\n
"
" v_mul_f32 v143, v18, v143 row_newbcast:7
\n
"
" v_mul_f32 v144, v18, v144 row_newbcast:8
\n
"
" v_mul_f32 v145, v18, v145 row_newbcast:9
\n
"
" v_mul_f32 v146, v18, v146 row_newbcast:10
\n
"
" v_mul_f32 v147, v18, v147 row_newbcast:11
\n
"
" v_mul_f32 v148, v18, v148 row_newbcast:8
\n
"
" v_mul_f32 v149, v18, v149 row_newbcast:9
\n
"
" v_mul_f32 v150, v18, v150 row_newbcast:10
\n
"
" v_mul_f32 v151, v18, v151 row_newbcast:11
\n
"
" v_mul_f32 v152, v18, v152 row_newbcast:12
\n
"
" v_mul_f32 v153, v18, v153 row_newbcast:13
\n
"
" v_mul_f32 v154, v18, v154 row_newbcast:14
\n
"
" v_mul_f32 v155, v18, v155 row_newbcast:15
\n
"
" v_mul_f32 v156, v18, v156 row_newbcast:12
\n
"
" v_mul_f32 v157, v18, v157 row_newbcast:13
\n
"
" v_mul_f32 v158, v18, v158 row_newbcast:14
\n
"
" v_mul_f32 v159, v18, v159 row_newbcast:15
\n
"
" v_mul_f32 v160, v19, v160 row_newbcast:0
\n
"
" v_mul_f32 v161, v19, v161 row_newbcast:1
\n
"
" v_mul_f32 v162, v19, v162 row_newbcast:2
\n
"
" v_mul_f32 v163, v19, v163 row_newbcast:3
\n
"
" v_mul_f32 v164, v19, v164 row_newbcast:0
\n
"
" v_mul_f32 v165, v19, v165 row_newbcast:1
\n
"
" v_mul_f32 v166, v19, v166 row_newbcast:2
\n
"
" v_mul_f32 v167, v19, v167 row_newbcast:3
\n
"
" v_mul_f32 v168, v19, v168 row_newbcast:4
\n
"
" v_mul_f32 v169, v19, v169 row_newbcast:5
\n
"
" v_mul_f32 v170, v19, v170 row_newbcast:6
\n
"
" v_mul_f32 v171, v19, v171 row_newbcast:7
\n
"
" v_mul_f32 v172, v19, v172 row_newbcast:4
\n
"
" v_mul_f32 v173, v19, v173 row_newbcast:5
\n
"
" v_mul_f32 v174, v19, v174 row_newbcast:6
\n
"
" v_mul_f32 v175, v19, v175 row_newbcast:7
\n
"
" v_mul_f32 v176, v19, v176 row_newbcast:8
\n
"
" v_mul_f32 v177, v19, v177 row_newbcast:9
\n
"
" v_mul_f32 v178, v19, v178 row_newbcast:10
\n
"
" v_mul_f32 v179, v19, v179 row_newbcast:11
\n
"
" v_mul_f32 v180, v19, v180 row_newbcast:8
\n
"
" v_mul_f32 v181, v19, v181 row_newbcast:9
\n
"
" v_mul_f32 v182, v19, v182 row_newbcast:10
\n
"
" v_mul_f32 v183, v19, v183 row_newbcast:11
\n
"
" v_mul_f32 v184, v19, v184 row_newbcast:12
\n
"
" v_mul_f32 v185, v19, v185 row_newbcast:13
\n
"
" v_mul_f32 v186, v19, v186 row_newbcast:14
\n
"
" v_mul_f32 v187, v19, v187 row_newbcast:15
\n
"
" v_mul_f32 v188, v19, v188 row_newbcast:12
\n
"
" v_mul_f32 v189, v19, v189 row_newbcast:13
\n
"
" v_mul_f32 v190, v19, v190 row_newbcast:14
\n
"
" v_mul_f32 v191, v19, v191 row_newbcast:15
\n
"
" buffer_load_dword v12, v5, s[16:19], 0 offen
\n
"
" v_mov_b32 v22, 0x358637bd
\n
"
" v_mov_b32 v23, 0x358637bd
\n
"
" v_max3_f32 v22, abs(v128), abs(v129), v22
\n
"
" v_max3_f32 v22, abs(v130), abs(v131), v22
\n
"
" v_max3_f32 v23, abs(v132), abs(v133), v23
\n
"
" v_max3_f32 v23, abs(v134), abs(v135), v23
\n
"
" v_max3_f32 v22, abs(v136), abs(v137), v22
\n
"
" v_max3_f32 v22, abs(v138), abs(v139), v22
\n
"
" v_max3_f32 v23, abs(v140), abs(v141), v23
\n
"
" v_max3_f32 v23, abs(v142), abs(v143), v23
\n
"
" v_max3_f32 v22, abs(v144), abs(v145), v22
\n
"
" v_max3_f32 v22, abs(v146), abs(v147), v22
\n
"
" v_max3_f32 v23, abs(v148), abs(v149), v23
\n
"
" v_max3_f32 v23, abs(v150), abs(v151), v23
\n
"
" v_max3_f32 v22, abs(v152), abs(v153), v22
\n
"
" v_max3_f32 v22, abs(v154), abs(v155), v22
\n
"
" v_max3_f32 v23, abs(v156), abs(v157), v23
\n
"
" v_max3_f32 v23, abs(v158), abs(v159), v23
\n
"
" v_max3_f32 v22, abs(v160), abs(v161), v22
\n
"
" v_max3_f32 v22, abs(v162), abs(v163), v22
\n
"
" v_max3_f32 v23, abs(v164), abs(v165), v23
\n
"
" v_max3_f32 v23, abs(v166), abs(v167), v23
\n
"
" v_max3_f32 v22, abs(v168), abs(v169), v22
\n
"
" v_max3_f32 v22, abs(v170), abs(v171), v22
\n
"
" v_max3_f32 v23, abs(v172), abs(v173), v23
\n
"
" v_max3_f32 v23, abs(v174), abs(v175), v23
\n
"
" v_max3_f32 v22, abs(v176), abs(v177), v22
\n
"
" v_max3_f32 v22, abs(v178), abs(v179), v22
\n
"
" v_max3_f32 v23, abs(v180), abs(v181), v23
\n
"
" v_max3_f32 v23, abs(v182), abs(v183), v23
\n
"
" v_max3_f32 v22, abs(v184), abs(v185), v22
\n
"
" v_max3_f32 v22, abs(v186), abs(v187), v22
\n
"
" v_max3_f32 v23, abs(v188), abs(v189), v23
\n
"
" v_max3_f32 v23, abs(v190), abs(v191), v23
\n
"
" v_lshlrev_b32 v54, 3, v0
\n
"
" s_mul_i32 s60, 0x00000200, s7
\n
"
" v_add_u32 v54, s60, v54
\n
"
" ds_write_b64 v54, v[22:23] offset:16640
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_barrier
\n
"
" v_and_b32 v54, 15, v0
\n
"
" v_lshlrev_b32 v54, 3, v54
\n
"
" ds_read_b64 v[96:97], v54 offset:16640
\n
"
" ds_read_b64 v[98:99], v54 offset:16768
\n
"
" ds_read_b64 v[100:101], v54 offset:16896
\n
"
" ds_read_b64 v[102:103], v54 offset:17024
\n
"
" ds_read_b64 v[104:105], v54 offset:17152
\n
"
" ds_read_b64 v[106:107], v54 offset:17280
\n
"
" ds_read_b64 v[108:109], v54 offset:17408
\n
"
" ds_read_b64 v[110:111], v54 offset:17536
\n
"
" ds_read_b64 v[112:113], v54 offset:17664
\n
"
" ds_read_b64 v[114:115], v54 offset:17792
\n
"
" ds_read_b64 v[116:117], v54 offset:17920
\n
"
" ds_read_b64 v[118:119], v54 offset:18048
\n
"
" ds_read_b64 v[120:121], v54 offset:18176
\n
"
" ds_read_b64 v[122:123], v54 offset:18304
\n
"
" ds_read_b64 v[124:125], v54 offset:18432
\n
"
" ds_read_b64 v[126:127], v54 offset:18560
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" v_max3_f32 v22, abs(v96), abs(v98), v22
\n
"
" v_max3_f32 v23, abs(v97), abs(v99), v23
\n
"
" v_max3_f32 v22, abs(v100), abs(v102), v22
\n
"
" v_max3_f32 v23, abs(v101), abs(v103), v23
\n
"
" v_max3_f32 v22, abs(v104), abs(v106), v22
\n
"
" v_max3_f32 v23, abs(v105), abs(v107), v23
\n
"
" v_max3_f32 v22, abs(v108), abs(v110), v22
\n
"
" v_max3_f32 v23, abs(v109), abs(v111), v23
\n
"
" v_max3_f32 v22, abs(v112), abs(v114), v22
\n
"
" v_max3_f32 v23, abs(v113), abs(v115), v23
\n
"
" v_max3_f32 v22, abs(v116), abs(v118), v22
\n
"
" v_max3_f32 v23, abs(v117), abs(v119), v23
\n
"
" v_max3_f32 v22, abs(v120), abs(v122), v22
\n
"
" v_max3_f32 v23, abs(v121), abs(v123), v23
\n
"
" v_max3_f32 v22, abs(v124), abs(v126), v22
\n
"
" v_max3_f32 v23, abs(v125), abs(v127), v23
\n
"
" v_rcp_f32 v22, v22
\n
"
" v_rcp_f32 v23, v23
\n
"
" v_mul_f32 v22, 0x42fe0000, v22
\n
"
" v_mul_f32 v23, 0x42fe0000, v23
\n
"
" v_mul_f32 v128, v22, v128
\n
"
" v_mul_f32 v129, v22, v129
\n
"
" v_mul_f32 v130, v22, v130
\n
"
" v_mul_f32 v131, v22, v131
\n
"
" v_cvt_i32_f32 v128, v128
\n
"
" v_cvt_i32_f32 v129, v129
\n
"
" v_cvt_i32_f32 v130, v130
\n
"
" v_cvt_i32_f32 v131, v131
\n
"
" v_perm_b32 v128, v129, v128, s53
\n
"
" v_perm_b32 v128, v130, v128, s54
\n
"
" v_perm_b32 v128, v131, v128, s55
\n
"
" v_mul_f32 v132, v23, v132
\n
"
" v_mul_f32 v133, v23, v133
\n
"
" v_mul_f32 v134, v23, v134
\n
"
" v_mul_f32 v135, v23, v135
\n
"
" v_cvt_i32_f32 v132, v132
\n
"
" v_cvt_i32_f32 v133, v133
\n
"
" v_cvt_i32_f32 v134, v134
\n
"
" v_cvt_i32_f32 v135, v135
\n
"
" v_perm_b32 v129, v133, v132, s53
\n
"
" v_perm_b32 v129, v134, v129, s54
\n
"
" v_perm_b32 v129, v135, v129, s55
\n
"
" v_mul_f32 v136, v22, v136
\n
"
" v_mul_f32 v137, v22, v137
\n
"
" v_mul_f32 v138, v22, v138
\n
"
" v_mul_f32 v139, v22, v139
\n
"
" v_cvt_i32_f32 v136, v136
\n
"
" v_cvt_i32_f32 v137, v137
\n
"
" v_cvt_i32_f32 v138, v138
\n
"
" v_cvt_i32_f32 v139, v139
\n
"
" v_perm_b32 v130, v137, v136, s53
\n
"
" v_perm_b32 v130, v138, v130, s54
\n
"
" v_perm_b32 v130, v139, v130, s55
\n
"
" v_mul_f32 v140, v23, v140
\n
"
" v_mul_f32 v141, v23, v141
\n
"
" v_mul_f32 v142, v23, v142
\n
"
" v_mul_f32 v143, v23, v143
\n
"
" v_cvt_i32_f32 v140, v140
\n
"
" v_cvt_i32_f32 v141, v141
\n
"
" v_cvt_i32_f32 v142, v142
\n
"
" v_cvt_i32_f32 v143, v143
\n
"
" v_perm_b32 v131, v141, v140, s53
\n
"
" v_perm_b32 v131, v142, v131, s54
\n
"
" v_perm_b32 v131, v143, v131, s55
\n
"
" v_mul_f32 v144, v22, v144
\n
"
" v_mul_f32 v145, v22, v145
\n
"
" v_mul_f32 v146, v22, v146
\n
"
" v_mul_f32 v147, v22, v147
\n
"
" v_cvt_i32_f32 v144, v144
\n
"
" v_cvt_i32_f32 v145, v145
\n
"
" v_cvt_i32_f32 v146, v146
\n
"
" v_cvt_i32_f32 v147, v147
\n
"
" v_perm_b32 v132, v145, v144, s53
\n
"
" v_perm_b32 v132, v146, v132, s54
\n
"
" v_perm_b32 v132, v147, v132, s55
\n
"
" v_mul_f32 v148, v23, v148
\n
"
" v_mul_f32 v149, v23, v149
\n
"
" v_mul_f32 v150, v23, v150
\n
"
" v_mul_f32 v151, v23, v151
\n
"
" v_cvt_i32_f32 v148, v148
\n
"
" v_cvt_i32_f32 v149, v149
\n
"
" v_cvt_i32_f32 v150, v150
\n
"
" v_cvt_i32_f32 v151, v151
\n
"
" v_perm_b32 v133, v149, v148, s53
\n
"
" v_perm_b32 v133, v150, v133, s54
\n
"
" v_perm_b32 v133, v151, v133, s55
\n
"
" v_mul_f32 v152, v22, v152
\n
"
" v_mul_f32 v153, v22, v153
\n
"
" v_mul_f32 v154, v22, v154
\n
"
" v_mul_f32 v155, v22, v155
\n
"
" v_cvt_i32_f32 v152, v152
\n
"
" v_cvt_i32_f32 v153, v153
\n
"
" v_cvt_i32_f32 v154, v154
\n
"
" v_cvt_i32_f32 v155, v155
\n
"
" v_perm_b32 v134, v153, v152, s53
\n
"
" v_perm_b32 v134, v154, v134, s54
\n
"
" v_perm_b32 v134, v155, v134, s55
\n
"
" v_mul_f32 v156, v23, v156
\n
"
" v_mul_f32 v157, v23, v157
\n
"
" v_mul_f32 v158, v23, v158
\n
"
" v_mul_f32 v159, v23, v159
\n
"
" v_cvt_i32_f32 v156, v156
\n
"
" v_cvt_i32_f32 v157, v157
\n
"
" v_cvt_i32_f32 v158, v158
\n
"
" v_cvt_i32_f32 v159, v159
\n
"
" v_perm_b32 v135, v157, v156, s53
\n
"
" v_perm_b32 v135, v158, v135, s54
\n
"
" v_perm_b32 v135, v159, v135, s55
\n
"
" v_mul_f32 v160, v22, v160
\n
"
" v_mul_f32 v161, v22, v161
\n
"
" v_mul_f32 v162, v22, v162
\n
"
" v_mul_f32 v163, v22, v163
\n
"
" v_cvt_i32_f32 v160, v160
\n
"
" v_cvt_i32_f32 v161, v161
\n
"
" v_cvt_i32_f32 v162, v162
\n
"
" v_cvt_i32_f32 v163, v163
\n
"
" v_perm_b32 v136, v161, v160, s53
\n
"
" v_perm_b32 v136, v162, v136, s54
\n
"
" v_perm_b32 v136, v163, v136, s55
\n
"
" v_mul_f32 v164, v23, v164
\n
"
" v_mul_f32 v165, v23, v165
\n
"
" v_mul_f32 v166, v23, v166
\n
"
" v_mul_f32 v167, v23, v167
\n
"
" v_cvt_i32_f32 v164, v164
\n
"
" v_cvt_i32_f32 v165, v165
\n
"
" v_cvt_i32_f32 v166, v166
\n
"
" v_cvt_i32_f32 v167, v167
\n
"
" v_perm_b32 v137, v165, v164, s53
\n
"
" v_perm_b32 v137, v166, v137, s54
\n
"
" v_perm_b32 v137, v167, v137, s55
\n
"
" v_mul_f32 v168, v22, v168
\n
"
" v_mul_f32 v169, v22, v169
\n
"
" v_mul_f32 v170, v22, v170
\n
"
" v_mul_f32 v171, v22, v171
\n
"
" v_cvt_i32_f32 v168, v168
\n
"
" v_cvt_i32_f32 v169, v169
\n
"
" v_cvt_i32_f32 v170, v170
\n
"
" v_cvt_i32_f32 v171, v171
\n
"
" v_perm_b32 v138, v169, v168, s53
\n
"
" v_perm_b32 v138, v170, v138, s54
\n
"
" v_perm_b32 v138, v171, v138, s55
\n
"
" v_mul_f32 v172, v23, v172
\n
"
" v_mul_f32 v173, v23, v173
\n
"
" v_mul_f32 v174, v23, v174
\n
"
" v_mul_f32 v175, v23, v175
\n
"
" v_cvt_i32_f32 v172, v172
\n
"
" v_cvt_i32_f32 v173, v173
\n
"
" v_cvt_i32_f32 v174, v174
\n
"
" v_cvt_i32_f32 v175, v175
\n
"
" v_perm_b32 v139, v173, v172, s53
\n
"
" v_perm_b32 v139, v174, v139, s54
\n
"
" v_perm_b32 v139, v175, v139, s55
\n
"
" v_mul_f32 v176, v22, v176
\n
"
" v_mul_f32 v177, v22, v177
\n
"
" v_mul_f32 v178, v22, v178
\n
"
" v_mul_f32 v179, v22, v179
\n
"
" v_cvt_i32_f32 v176, v176
\n
"
" v_cvt_i32_f32 v177, v177
\n
"
" v_cvt_i32_f32 v178, v178
\n
"
" v_cvt_i32_f32 v179, v179
\n
"
" v_perm_b32 v140, v177, v176, s53
\n
"
" v_perm_b32 v140, v178, v140, s54
\n
"
" v_perm_b32 v140, v179, v140, s55
\n
"
" v_mul_f32 v180, v23, v180
\n
"
" v_mul_f32 v181, v23, v181
\n
"
" v_mul_f32 v182, v23, v182
\n
"
" v_mul_f32 v183, v23, v183
\n
"
" v_cvt_i32_f32 v180, v180
\n
"
" v_cvt_i32_f32 v181, v181
\n
"
" v_cvt_i32_f32 v182, v182
\n
"
" v_cvt_i32_f32 v183, v183
\n
"
" v_perm_b32 v141, v181, v180, s53
\n
"
" v_perm_b32 v141, v182, v141, s54
\n
"
" v_perm_b32 v141, v183, v141, s55
\n
"
" v_mul_f32 v184, v22, v184
\n
"
" v_mul_f32 v185, v22, v185
\n
"
" v_mul_f32 v186, v22, v186
\n
"
" v_mul_f32 v187, v22, v187
\n
"
" v_cvt_i32_f32 v184, v184
\n
"
" v_cvt_i32_f32 v185, v185
\n
"
" v_cvt_i32_f32 v186, v186
\n
"
" v_cvt_i32_f32 v187, v187
\n
"
" v_perm_b32 v142, v185, v184, s53
\n
"
" v_perm_b32 v142, v186, v142, s54
\n
"
" v_perm_b32 v142, v187, v142, s55
\n
"
" v_mul_f32 v188, v23, v188
\n
"
" v_mul_f32 v189, v23, v189
\n
"
" v_mul_f32 v190, v23, v190
\n
"
" v_mul_f32 v191, v23, v191
\n
"
" v_cvt_i32_f32 v188, v188
\n
"
" v_cvt_i32_f32 v189, v189
\n
"
" v_cvt_i32_f32 v190, v190
\n
"
" v_cvt_i32_f32 v191, v191
\n
"
" v_perm_b32 v143, v189, v188, s53
\n
"
" v_perm_b32 v143, v190, v143, s54
\n
"
" v_perm_b32 v143, v191, v143, s55
\n
"
" v_rcp_f32 v24, v22
\n
"
" v_rcp_f32 v25, v23
\n
"
" v_lshrrev_b32 v54, 5, v0
\n
"
" v_lshlrev_b32 v55, 5, v54
\n
"
" v_and_b32 v54, 31, v0
\n
"
" v_lshrrev_b32 v56, 4, v54
\n
"
" v_add_u32 v55, v56, v55
\n
"
" v_and_b32 v54, 15, v0
\n
"
" v_lshlrev_b32 v54, 1, v54
\n
"
" v_add_u32 v55, v54, v55
\n
"
" v_lshlrev_b32 v54, 2, v55
\n
"
" s_mul_i32 s60, 0x00000100, s7
\n
"
" v_add_u32 v54, v54, s60
\n
"
" ds_write_b32 v54, v128 offset:18688
\n
"
" ds_write_b32 v54, v129 offset:26880
\n
"
" ds_write_b32 v54, v130 offset:19712
\n
"
" ds_write_b32 v54, v131 offset:27904
\n
"
" ds_write_b32 v54, v132 offset:20736
\n
"
" ds_write_b32 v54, v133 offset:28928
\n
"
" ds_write_b32 v54, v134 offset:21760
\n
"
" ds_write_b32 v54, v135 offset:29952
\n
"
" ds_write_b32 v54, v136 offset:22784
\n
"
" ds_write_b32 v54, v137 offset:30976
\n
"
" ds_write_b32 v54, v138 offset:23808
\n
"
" ds_write_b32 v54, v139 offset:32000
\n
"
" ds_write_b32 v54, v140 offset:24832
\n
"
" ds_write_b32 v54, v141 offset:33024
\n
"
" ds_write_b32 v54, v142 offset:25856
\n
"
" ds_write_b32 v54, v143 offset:34048
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_barrier
\n
"
" v_lshrrev_b32 v54, 4, v0
\n
"
" v_lshlrev_b32 v55, 6, v54
\n
"
" v_and_b32 v54, 15, v0
\n
"
" v_lshlrev_b32 v54, 1, v54
\n
"
" v_add_u32 v55, v54, v55
\n
"
" v_lshlrev_b32 v54, 2, v55
\n
"
" ds_read_b64 v[128:129], v54 offset:18688
\n
"
" ds_read_b64 v[130:131], v54 offset:18816
\n
"
" ds_read_b64 v[132:133], v54 offset:19712
\n
"
" ds_read_b64 v[134:135], v54 offset:19840
\n
"
" ds_read_b64 v[136:137], v54 offset:20736
\n
"
" ds_read_b64 v[138:139], v54 offset:20864
\n
"
" ds_read_b64 v[140:141], v54 offset:21760
\n
"
" ds_read_b64 v[142:143], v54 offset:21888
\n
"
" ds_read_b64 v[144:145], v54 offset:22784
\n
"
" ds_read_b64 v[146:147], v54 offset:22912
\n
"
" ds_read_b64 v[148:149], v54 offset:23808
\n
"
" ds_read_b64 v[150:151], v54 offset:23936
\n
"
" ds_read_b64 v[152:153], v54 offset:24832
\n
"
" ds_read_b64 v[154:155], v54 offset:24960
\n
"
" ds_read_b64 v[156:157], v54 offset:25856
\n
"
" ds_read_b64 v[158:159], v54 offset:25984
\n
"
" ds_read_b64 v[160:161], v54 offset:26880
\n
"
" ds_read_b64 v[162:163], v54 offset:27008
\n
"
" ds_read_b64 v[164:165], v54 offset:27904
\n
"
" ds_read_b64 v[166:167], v54 offset:28032
\n
"
" ds_read_b64 v[168:169], v54 offset:28928
\n
"
" ds_read_b64 v[170:171], v54 offset:29056
\n
"
" ds_read_b64 v[172:173], v54 offset:29952
\n
"
" ds_read_b64 v[174:175], v54 offset:30080
\n
"
" ds_read_b64 v[176:177], v54 offset:30976
\n
"
" ds_read_b64 v[178:179], v54 offset:31104
\n
"
" ds_read_b64 v[180:181], v54 offset:32000
\n
"
" ds_read_b64 v[182:183], v54 offset:32128
\n
"
" ds_read_b64 v[184:185], v54 offset:33024
\n
"
" ds_read_b64 v[186:187], v54 offset:33152
\n
"
" ds_read_b64 v[188:189], v54 offset:34048
\n
"
" ds_read_b64 v[190:191], v54 offset:34176
\n
"
" s_add_u32 s12, %[s_tile_os_b], s12
\n
"
" s_addc_u32 s13, 0, s13
\n
"
" s_add_u32 s16, %[s_tile_os_dq], s16
\n
"
" s_addc_u32 s17, 0, s17
\n
"
" s_mov_b32 s80, 0
\n
"
" s_waitcnt 0x0000
\n
"
"label_0C3C:
\n
"
" s_waitcnt vmcnt(41)
\n
"
" s_barrier
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[0:1], v[128:129], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[2:3], v[130:131], v[192:195]
\n
"
" buffer_load_dwordx4 acc[128:131], %[v_os_b0], s[12:15], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[4:5], v[132:133], v[192:195]
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[6:7], v[134:135], v[192:195]
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[8:9], v[136:137], v[192:195]
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[10:11], v[138:139], v[192:195]
\n
"
" buffer_load_dwordx4 acc[132:135], %[v_os_b0], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[12:13], v[140:141], v[192:195]
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[14:15], v[142:143], v[192:195]
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[0:1], v[160:161], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[2:3], v[162:163], v[196:199]
\n
"
" buffer_load_dwordx4 acc[136:139], %[v_os_b0], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[4:5], v[164:165], v[196:199]
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[6:7], v[166:167], v[196:199]
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[8:9], v[168:169], v[196:199]
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[10:11], v[170:171], v[196:199]
\n
"
" buffer_load_dwordx4 acc[140:143], %[v_os_b0], s[12:15], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[12:13], v[172:173], v[196:199]
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[14:15], v[174:175], v[196:199]
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[16:17], v[128:129], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[18:19], v[130:131], v[200:203]
\n
"
" buffer_load_dwordx4 acc[144:147], %[v_os_b1], s[12:15], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[20:21], v[132:133], v[200:203]
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[22:23], v[134:135], v[200:203]
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[24:25], v[136:137], v[200:203]
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[26:27], v[138:139], v[200:203]
\n
"
" buffer_load_dwordx4 acc[148:151], %[v_os_b1], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[28:29], v[140:141], v[200:203]
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[30:31], v[142:143], v[200:203]
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[16:17], v[160:161], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[18:19], v[162:163], v[204:207]
\n
"
" buffer_load_dwordx4 acc[152:155], %[v_os_b1], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[20:21], v[164:165], v[204:207]
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[22:23], v[166:167], v[204:207]
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[24:25], v[168:169], v[204:207]
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[26:27], v[170:171], v[204:207]
\n
"
" buffer_load_dwordx4 acc[156:159], %[v_os_b1], s[12:15], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[28:29], v[172:173], v[204:207]
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[30:31], v[174:175], v[204:207]
\n
"
" s_waitcnt vmcnt(41)
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[32:33], v[128:129], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[34:35], v[130:131], v[208:211]
\n
"
" buffer_load_dwordx4 acc[160:163], %[v_os_b2], s[12:15], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[36:37], v[132:133], v[208:211]
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[38:39], v[134:135], v[208:211]
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[40:41], v[136:137], v[208:211]
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[42:43], v[138:139], v[208:211]
\n
"
" buffer_load_dwordx4 acc[164:167], %[v_os_b2], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[44:45], v[140:141], v[208:211]
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[46:47], v[142:143], v[208:211]
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[32:33], v[160:161], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[34:35], v[162:163], v[212:215]
\n
"
" buffer_load_dwordx4 acc[168:171], %[v_os_b2], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[36:37], v[164:165], v[212:215]
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[38:39], v[166:167], v[212:215]
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[40:41], v[168:169], v[212:215]
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[42:43], v[170:171], v[212:215]
\n
"
" buffer_load_dwordx4 acc[172:175], %[v_os_b2], s[12:15], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[44:45], v[172:173], v[212:215]
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[46:47], v[174:175], v[212:215]
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[48:49], v[128:129], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[50:51], v[130:131], v[216:219]
\n
"
" buffer_load_dwordx4 acc[176:179], %[v_os_b3], s[12:15], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[52:53], v[132:133], v[216:219]
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[54:55], v[134:135], v[216:219]
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[56:57], v[136:137], v[216:219]
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[58:59], v[138:139], v[216:219]
\n
"
" buffer_load_dwordx4 acc[180:183], %[v_os_b3], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[60:61], v[140:141], v[216:219]
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[62:63], v[142:143], v[216:219]
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[48:49], v[160:161], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[50:51], v[162:163], v[220:223]
\n
"
" buffer_load_dwordx4 acc[184:187], %[v_os_b3], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[52:53], v[164:165], v[220:223]
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[54:55], v[166:167], v[220:223]
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[56:57], v[168:169], v[220:223]
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[58:59], v[170:171], v[220:223]
\n
"
" buffer_load_dwordx4 acc[188:191], %[v_os_b3], s[12:15], 0 offen offset:3072
\n
"
" s_add_u32 s12, %[s_tile_os_b_half], s12
\n
"
" s_addc_u32 s13, 0, s13
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[60:61], v[172:173], v[220:223]
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[62:63], v[174:175], v[220:223]
\n
"
" s_waitcnt vmcnt(41)
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[64:65], v[144:145], v[192:195]
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[66:67], v[146:147], v[192:195]
\n
"
" buffer_load_dwordx4 acc[192:195], %[v_os_b0], s[12:15], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[68:69], v[148:149], v[192:195]
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[70:71], v[150:151], v[192:195]
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[72:73], v[152:153], v[192:195]
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[74:75], v[154:155], v[192:195]
\n
"
" buffer_load_dwordx4 acc[196:199], %[v_os_b0], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[76:77], v[156:157], v[192:195]
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[78:79], v[158:159], v[192:195]
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[64:65], v[176:177], v[196:199]
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[66:67], v[178:179], v[196:199]
\n
"
" buffer_load_dwordx4 acc[200:203], %[v_os_b0], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[68:69], v[180:181], v[196:199]
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[70:71], v[182:183], v[196:199]
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[72:73], v[184:185], v[196:199]
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[74:75], v[186:187], v[196:199]
\n
"
" buffer_load_dwordx4 acc[204:207], %[v_os_b0], s[12:15], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[76:77], v[188:189], v[196:199]
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[78:79], v[190:191], v[196:199]
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[80:81], v[144:145], v[200:203]
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[82:83], v[146:147], v[200:203]
\n
"
" buffer_load_dwordx4 acc[208:211], %[v_os_b1], s[12:15], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[84:85], v[148:149], v[200:203]
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[86:87], v[150:151], v[200:203]
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[88:89], v[152:153], v[200:203]
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[90:91], v[154:155], v[200:203]
\n
"
" buffer_load_dwordx4 acc[212:215], %[v_os_b1], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[92:93], v[156:157], v[200:203]
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[94:95], v[158:159], v[200:203]
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[80:81], v[176:177], v[204:207]
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[82:83], v[178:179], v[204:207]
\n
"
" buffer_load_dwordx4 acc[216:219], %[v_os_b1], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[84:85], v[180:181], v[204:207]
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[86:87], v[182:183], v[204:207]
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[88:89], v[184:185], v[204:207]
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[90:91], v[186:187], v[204:207]
\n
"
" buffer_load_dwordx4 acc[220:223], %[v_os_b1], s[12:15], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[92:93], v[188:189], v[204:207]
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[94:95], v[190:191], v[204:207]
\n
"
" s_waitcnt vmcnt(40)
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[96:97], v[144:145], v[208:211]
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[98:99], v[146:147], v[208:211]
\n
"
" buffer_load_dwordx4 acc[224:227], %[v_os_b2], s[12:15], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[100:101], v[148:149], v[208:211]
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[102:103], v[150:151], v[208:211]
\n
"
" buffer_load_dword v13, v5, s[16:19], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[104:105], v[152:153], v[208:211]
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[106:107], v[154:155], v[208:211]
\n
"
" buffer_load_dwordx4 acc[228:231], %[v_os_b2], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[108:109], v[156:157], v[208:211]
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[110:111], v[158:159], v[208:211]
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[96:97], v[176:177], v[212:215]
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[98:99], v[178:179], v[212:215]
\n
"
" buffer_load_dwordx4 acc[232:235], %[v_os_b2], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[100:101], v[180:181], v[212:215]
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[102:103], v[182:183], v[212:215]
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[104:105], v[184:185], v[212:215]
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[106:107], v[186:187], v[212:215]
\n
"
" buffer_load_dwordx4 acc[236:239], %[v_os_b2], s[12:15], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[108:109], v[188:189], v[212:215]
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[110:111], v[190:191], v[212:215]
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[112:113], v[144:145], v[216:219]
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[114:115], v[146:147], v[216:219]
\n
"
" buffer_load_dwordx4 acc[240:243], %[v_os_b3], s[12:15], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[116:117], v[148:149], v[216:219]
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[118:119], v[150:151], v[216:219]
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[120:121], v[152:153], v[216:219]
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[122:123], v[154:155], v[216:219]
\n
"
" buffer_load_dwordx4 acc[244:247], %[v_os_b3], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[124:125], v[156:157], v[216:219]
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[126:127], v[158:159], v[216:219]
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[112:113], v[176:177], v[220:223]
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[114:115], v[178:179], v[220:223]
\n
"
" buffer_load_dwordx4 acc[248:251], %[v_os_b3], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[116:117], v[180:181], v[220:223]
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[118:119], v[182:183], v[220:223]
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[120:121], v[184:185], v[220:223]
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[122:123], v[186:187], v[220:223]
\n
"
" buffer_load_dwordx4 acc[252:255], %[v_os_b3], s[12:15], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[124:125], v[188:189], v[220:223]
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[126:127], v[190:191], v[220:223]
\n
"
" s_add_u32 s60, 0x00000200, s80
\n
"
" s_cmp_lt_u32 s60, s81
\n
"
" s_cselect_b32 %[s_tile_os_b], %[s_tile_os_b], 0
\n
"
" s_cselect_b32 %[s_tile_os_b_half], %[s_tile_os_b_half], 0
\n
"
" s_cselect_b32 %[s_tile_os_dq], %[s_tile_os_dq], 0
\n
"
" s_add_u32 s12, %[s_tile_os_b], s12
\n
"
" s_addc_u32 s13, 0, s13
\n
"
" s_add_u32 s16, %[s_tile_os_dq], s16
\n
"
" s_addc_u32 s17, 0, s17
\n
"
" v_cvt_f32_i32 v192, v192
\n
"
" v_cvt_f32_i32 v193, v193
\n
"
" v_cvt_f32_i32 v194, v194
\n
"
" v_cvt_f32_i32 v195, v195
\n
"
" v_mul_f32 v192, v24, v192
\n
"
" v_mul_f32 v193, v24, v193
\n
"
" v_mul_f32 v194, v24, v194
\n
"
" v_mul_f32 v195, v24, v195
\n
"
" v_mul_f32 v192, v12, v192 row_newbcast:0
\n
"
" v_mul_f32 v193, v12, v193 row_newbcast:1
\n
"
" v_mul_f32 v194, v12, v194 row_newbcast:2
\n
"
" v_mul_f32 v195, v12, v195 row_newbcast:3
\n
"
" v_mul_f32 v192, v20, v192
\n
"
" v_mul_f32 v193, v20, v193
\n
"
" v_mul_f32 v194, v20, v194
\n
"
" v_mul_f32 v195, v20, v195
\n
"
" v_cvt_f32_i32 v196, v196
\n
"
" v_cvt_f32_i32 v197, v197
\n
"
" v_cvt_f32_i32 v198, v198
\n
"
" v_cvt_f32_i32 v199, v199
\n
"
" v_mul_f32 v196, v25, v196
\n
"
" v_mul_f32 v197, v25, v197
\n
"
" v_mul_f32 v198, v25, v198
\n
"
" v_mul_f32 v199, v25, v199
\n
"
" v_mul_f32 v196, v12, v196 row_newbcast:0
\n
"
" v_mul_f32 v197, v12, v197 row_newbcast:1
\n
"
" v_mul_f32 v198, v12, v198 row_newbcast:2
\n
"
" v_mul_f32 v199, v12, v199 row_newbcast:3
\n
"
" v_mul_f32 v196, v21, v196
\n
"
" v_mul_f32 v197, v21, v197
\n
"
" v_mul_f32 v198, v21, v198
\n
"
" v_mul_f32 v199, v21, v199
\n
"
" v_cvt_f32_i32 v200, v200
\n
"
" v_cvt_f32_i32 v201, v201
\n
"
" v_cvt_f32_i32 v202, v202
\n
"
" v_cvt_f32_i32 v203, v203
\n
"
" v_mul_f32 v200, v24, v200
\n
"
" v_mul_f32 v201, v24, v201
\n
"
" v_mul_f32 v202, v24, v202
\n
"
" v_mul_f32 v203, v24, v203
\n
"
" v_mul_f32 v200, v12, v200 row_newbcast:4
\n
"
" v_mul_f32 v201, v12, v201 row_newbcast:5
\n
"
" v_mul_f32 v202, v12, v202 row_newbcast:6
\n
"
" v_mul_f32 v203, v12, v203 row_newbcast:7
\n
"
" v_mul_f32 v200, v20, v200
\n
"
" v_mul_f32 v201, v20, v201
\n
"
" v_mul_f32 v202, v20, v202
\n
"
" v_mul_f32 v203, v20, v203
\n
"
" v_cvt_f32_i32 v204, v204
\n
"
" v_cvt_f32_i32 v205, v205
\n
"
" v_cvt_f32_i32 v206, v206
\n
"
" v_cvt_f32_i32 v207, v207
\n
"
" v_mul_f32 v204, v25, v204
\n
"
" v_mul_f32 v205, v25, v205
\n
"
" v_mul_f32 v206, v25, v206
\n
"
" v_mul_f32 v207, v25, v207
\n
"
" v_mul_f32 v204, v12, v204 row_newbcast:4
\n
"
" v_mul_f32 v205, v12, v205 row_newbcast:5
\n
"
" v_mul_f32 v206, v12, v206 row_newbcast:6
\n
"
" v_mul_f32 v207, v12, v207 row_newbcast:7
\n
"
" v_mul_f32 v204, v21, v204
\n
"
" v_mul_f32 v205, v21, v205
\n
"
" v_mul_f32 v206, v21, v206
\n
"
" v_mul_f32 v207, v21, v207
\n
"
" v_cvt_f32_i32 v208, v208
\n
"
" v_cvt_f32_i32 v209, v209
\n
"
" v_cvt_f32_i32 v210, v210
\n
"
" v_cvt_f32_i32 v211, v211
\n
"
" v_mul_f32 v208, v24, v208
\n
"
" v_mul_f32 v209, v24, v209
\n
"
" v_mul_f32 v210, v24, v210
\n
"
" v_mul_f32 v211, v24, v211
\n
"
" v_mul_f32 v208, v12, v208 row_newbcast:8
\n
"
" v_mul_f32 v209, v12, v209 row_newbcast:9
\n
"
" v_mul_f32 v210, v12, v210 row_newbcast:10
\n
"
" v_mul_f32 v211, v12, v211 row_newbcast:11
\n
"
" v_mul_f32 v208, v20, v208
\n
"
" v_mul_f32 v209, v20, v209
\n
"
" v_mul_f32 v210, v20, v210
\n
"
" v_mul_f32 v211, v20, v211
\n
"
" v_cvt_f32_i32 v212, v212
\n
"
" v_cvt_f32_i32 v213, v213
\n
"
" v_cvt_f32_i32 v214, v214
\n
"
" v_cvt_f32_i32 v215, v215
\n
"
" v_mul_f32 v212, v25, v212
\n
"
" v_mul_f32 v213, v25, v213
\n
"
" v_mul_f32 v214, v25, v214
\n
"
" v_mul_f32 v215, v25, v215
\n
"
" v_mul_f32 v212, v12, v212 row_newbcast:8
\n
"
" v_mul_f32 v213, v12, v213 row_newbcast:9
\n
"
" v_mul_f32 v214, v12, v214 row_newbcast:10
\n
"
" v_mul_f32 v215, v12, v215 row_newbcast:11
\n
"
" v_mul_f32 v212, v21, v212
\n
"
" v_mul_f32 v213, v21, v213
\n
"
" v_mul_f32 v214, v21, v214
\n
"
" v_mul_f32 v215, v21, v215
\n
"
" v_cvt_f32_i32 v216, v216
\n
"
" v_cvt_f32_i32 v217, v217
\n
"
" v_cvt_f32_i32 v218, v218
\n
"
" v_cvt_f32_i32 v219, v219
\n
"
" v_mul_f32 v216, v24, v216
\n
"
" v_mul_f32 v217, v24, v217
\n
"
" v_mul_f32 v218, v24, v218
\n
"
" v_mul_f32 v219, v24, v219
\n
"
" v_mul_f32 v216, v12, v216 row_newbcast:12
\n
"
" v_mul_f32 v217, v12, v217 row_newbcast:13
\n
"
" v_mul_f32 v218, v12, v218 row_newbcast:14
\n
"
" v_mul_f32 v219, v12, v219 row_newbcast:15
\n
"
" v_mul_f32 v216, v20, v216
\n
"
" v_mul_f32 v217, v20, v217
\n
"
" v_mul_f32 v218, v20, v218
\n
"
" v_mul_f32 v219, v20, v219
\n
"
" v_cvt_f32_i32 v220, v220
\n
"
" v_cvt_f32_i32 v221, v221
\n
"
" v_cvt_f32_i32 v222, v222
\n
"
" v_cvt_f32_i32 v223, v223
\n
"
" v_mul_f32 v220, v25, v220
\n
"
" v_mul_f32 v221, v25, v221
\n
"
" v_mul_f32 v222, v25, v222
\n
"
" v_mul_f32 v223, v25, v223
\n
"
" v_mul_f32 v220, v12, v220 row_newbcast:12
\n
"
" v_mul_f32 v221, v12, v221 row_newbcast:13
\n
"
" v_mul_f32 v222, v12, v222 row_newbcast:14
\n
"
" v_mul_f32 v223, v12, v223 row_newbcast:15
\n
"
" v_mul_f32 v220, v21, v220
\n
"
" v_mul_f32 v221, v21, v221
\n
"
" v_mul_f32 v222, v21, v222
\n
"
" v_mul_f32 v223, v21, v223
\n
"
" v_cmp_u_f32 s[48:49], v192, v192
\n
"
" v_add3_u32 v50, v192, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v193, v193
\n
"
" v_add3_u32 v50, v193, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v192, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v194, v194
\n
"
" v_add3_u32 v50, v194, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v195, v195
\n
"
" v_add3_u32 v50, v195, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v193, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v196, v196
\n
"
" v_add3_u32 v50, v196, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v197, v197
\n
"
" v_add3_u32 v50, v197, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v194, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v198, v198
\n
"
" v_add3_u32 v50, v198, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v199, v199
\n
"
" v_add3_u32 v50, v199, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v195, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v200, v200
\n
"
" v_add3_u32 v50, v200, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v201, v201
\n
"
" v_add3_u32 v50, v201, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v196, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v202, v202
\n
"
" v_add3_u32 v50, v202, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v203, v203
\n
"
" v_add3_u32 v50, v203, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v197, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v204, v204
\n
"
" v_add3_u32 v50, v204, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v205, v205
\n
"
" v_add3_u32 v50, v205, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v198, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v206, v206
\n
"
" v_add3_u32 v50, v206, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v207, v207
\n
"
" v_add3_u32 v50, v207, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v199, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v208, v208
\n
"
" v_add3_u32 v50, v208, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v209, v209
\n
"
" v_add3_u32 v50, v209, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v200, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v210, v210
\n
"
" v_add3_u32 v50, v210, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v211, v211
\n
"
" v_add3_u32 v50, v211, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v201, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v212, v212
\n
"
" v_add3_u32 v50, v212, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v213, v213
\n
"
" v_add3_u32 v50, v213, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v202, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v214, v214
\n
"
" v_add3_u32 v50, v214, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v215, v215
\n
"
" v_add3_u32 v50, v215, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v203, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v216, v216
\n
"
" v_add3_u32 v50, v216, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v217, v217
\n
"
" v_add3_u32 v50, v217, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v204, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v218, v218
\n
"
" v_add3_u32 v50, v218, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v219, v219
\n
"
" v_add3_u32 v50, v219, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v205, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v220, v220
\n
"
" v_add3_u32 v50, v220, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v221, v221
\n
"
" v_add3_u32 v50, v221, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v206, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v222, v222
\n
"
" v_add3_u32 v50, v222, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v223, v223
\n
"
" v_add3_u32 v50, v223, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v207, v55, v54, s52
\n
"
" ds_write_b64 v3, v[192:193] offset:35072
\n
"
" ds_write_b64 v3, v[194:195] offset:43776
\n
"
" ds_write_b64 v3, v[196:197] offset:37248
\n
"
" ds_write_b64 v3, v[198:199] offset:45952
\n
"
" ds_write_b64 v3, v[200:201] offset:39424
\n
"
" ds_write_b64 v3, v[202:203] offset:48128
\n
"
" ds_write_b64 v3, v[204:205] offset:41600
\n
"
" ds_write_b64 v3, v[206:207] offset:50304
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_barrier
\n
"
" ds_read_b32 v64, v4 offset:35072
\n
"
" ds_read_b32 v65, v4 offset:39424
\n
"
" ds_read_b32 v66, v4 offset:35104
\n
"
" ds_read_b32 v67, v4 offset:39456
\n
"
" ds_read_b32 v68, v4 offset:35136
\n
"
" ds_read_b32 v69, v4 offset:39488
\n
"
" ds_read_b32 v70, v4 offset:35168
\n
"
" ds_read_b32 v71, v4 offset:39520
\n
"
" ds_read_b32 v72, v4 offset:43776
\n
"
" ds_read_b32 v73, v4 offset:48128
\n
"
" ds_read_b32 v74, v4 offset:43808
\n
"
" ds_read_b32 v75, v4 offset:48160
\n
"
" ds_read_b32 v76, v4 offset:43840
\n
"
" ds_read_b32 v77, v4 offset:48192
\n
"
" ds_read_b32 v78, v4 offset:43872
\n
"
" ds_read_b32 v79, v4 offset:48224
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_mov_b64 exec, s[20:21]
\n
"
" global_atomic_pk_add_bf16 v80, v64, s[8:9]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[20:21]
\n
"
" global_atomic_pk_add_bf16 v80, v65, s[8:9] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[22:23]
\n
"
" global_atomic_pk_add_bf16 v82, v66, s[8:9]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[22:23]
\n
"
" global_atomic_pk_add_bf16 v82, v67, s[8:9] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[24:25]
\n
"
" global_atomic_pk_add_bf16 v84, v68, s[8:9]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[24:25]
\n
"
" global_atomic_pk_add_bf16 v84, v69, s[8:9] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[26:27]
\n
"
" global_atomic_pk_add_bf16 v86, v70, s[8:9]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[26:27]
\n
"
" global_atomic_pk_add_bf16 v86, v71, s[8:9] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[28:29]
\n
"
" global_atomic_pk_add_bf16 v88, v72, s[8:9]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[28:29]
\n
"
" global_atomic_pk_add_bf16 v88, v73, s[8:9] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[30:31]
\n
"
" global_atomic_pk_add_bf16 v90, v74, s[8:9]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[30:31]
\n
"
" global_atomic_pk_add_bf16 v90, v75, s[8:9] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[32:33]
\n
"
" global_atomic_pk_add_bf16 v92, v76, s[8:9]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[32:33]
\n
"
" global_atomic_pk_add_bf16 v92, v77, s[8:9] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[34:35]
\n
"
" global_atomic_pk_add_bf16 v94, v78, s[8:9]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[34:35]
\n
"
" global_atomic_pk_add_bf16 v94, v79, s[8:9] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_add_u32 s8, s59, s8
\n
"
" s_addc_u32 s9, 0, s9
\n
"
" s_addk_i32 s80, 0x0100
\n
"
" s_cmp_lt_i32 s80, s81
\n
"
" s_cbranch_scc0 label_2301
\n
"
" s_waitcnt vmcnt(41)
\n
"
" s_barrier
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[128:129], v[128:129], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[130:131], v[130:131], v[224:227]
\n
"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[132:133], v[132:133], v[224:227]
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[134:135], v[134:135], v[224:227]
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[136:137], v[136:137], v[224:227]
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[138:139], v[138:139], v[224:227]
\n
"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[140:141], v[140:141], v[224:227]
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[142:143], v[142:143], v[224:227]
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[128:129], v[160:161], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[130:131], v[162:163], v[228:231]
\n
"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[132:133], v[164:165], v[228:231]
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[134:135], v[166:167], v[228:231]
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[136:137], v[168:169], v[228:231]
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[138:139], v[170:171], v[228:231]
\n
"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[140:141], v[172:173], v[228:231]
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[142:143], v[174:175], v[228:231]
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[144:145], v[128:129], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[146:147], v[130:131], v[232:235]
\n
"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[148:149], v[132:133], v[232:235]
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[150:151], v[134:135], v[232:235]
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[152:153], v[136:137], v[232:235]
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[154:155], v[138:139], v[232:235]
\n
"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[156:157], v[140:141], v[232:235]
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[158:159], v[142:143], v[232:235]
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[144:145], v[160:161], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[146:147], v[162:163], v[236:239]
\n
"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[148:149], v[164:165], v[236:239]
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[150:151], v[166:167], v[236:239]
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[152:153], v[168:169], v[236:239]
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[154:155], v[170:171], v[236:239]
\n
"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[156:157], v[172:173], v[236:239]
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[158:159], v[174:175], v[236:239]
\n
"
" s_waitcnt vmcnt(41)
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[160:161], v[128:129], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[162:163], v[130:131], v[240:243]
\n
"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[164:165], v[132:133], v[240:243]
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[166:167], v[134:135], v[240:243]
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[168:169], v[136:137], v[240:243]
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[170:171], v[138:139], v[240:243]
\n
"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[172:173], v[140:141], v[240:243]
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[174:175], v[142:143], v[240:243]
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[160:161], v[160:161], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[162:163], v[162:163], v[244:247]
\n
"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[164:165], v[164:165], v[244:247]
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[166:167], v[166:167], v[244:247]
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[168:169], v[168:169], v[244:247]
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[170:171], v[170:171], v[244:247]
\n
"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[172:173], v[172:173], v[244:247]
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[174:175], v[174:175], v[244:247]
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[176:177], v[128:129], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[178:179], v[130:131], v[248:251]
\n
"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[180:181], v[132:133], v[248:251]
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[182:183], v[134:135], v[248:251]
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[184:185], v[136:137], v[248:251]
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[186:187], v[138:139], v[248:251]
\n
"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[188:189], v[140:141], v[248:251]
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[190:191], v[142:143], v[248:251]
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[176:177], v[160:161], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[178:179], v[162:163], v[252:255]
\n
"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[180:181], v[164:165], v[252:255]
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[182:183], v[166:167], v[252:255]
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[184:185], v[168:169], v[252:255]
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[186:187], v[170:171], v[252:255]
\n
"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072
\n
"
" s_add_u32 s12, %[s_tile_os_b_half], s12
\n
"
" s_addc_u32 s13, 0, s13
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[188:189], v[172:173], v[252:255]
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[190:191], v[174:175], v[252:255]
\n
"
" s_waitcnt vmcnt(41)
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[192:193], v[144:145], v[224:227]
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[194:195], v[146:147], v[224:227]
\n
"
" buffer_load_dwordx4 acc[64:67], %[v_os_b0], s[12:15], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[196:197], v[148:149], v[224:227]
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[198:199], v[150:151], v[224:227]
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[200:201], v[152:153], v[224:227]
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[202:203], v[154:155], v[224:227]
\n
"
" buffer_load_dwordx4 acc[68:71], %[v_os_b0], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[204:205], v[156:157], v[224:227]
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[206:207], v[158:159], v[224:227]
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[192:193], v[176:177], v[228:231]
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[194:195], v[178:179], v[228:231]
\n
"
" buffer_load_dwordx4 acc[72:75], %[v_os_b0], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[196:197], v[180:181], v[228:231]
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[198:199], v[182:183], v[228:231]
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[200:201], v[184:185], v[228:231]
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[202:203], v[186:187], v[228:231]
\n
"
" buffer_load_dwordx4 acc[76:79], %[v_os_b0], s[12:15], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[204:205], v[188:189], v[228:231]
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[206:207], v[190:191], v[228:231]
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[208:209], v[144:145], v[232:235]
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[210:211], v[146:147], v[232:235]
\n
"
" buffer_load_dwordx4 acc[80:83], %[v_os_b1], s[12:15], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[212:213], v[148:149], v[232:235]
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[214:215], v[150:151], v[232:235]
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[216:217], v[152:153], v[232:235]
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[218:219], v[154:155], v[232:235]
\n
"
" buffer_load_dwordx4 acc[84:87], %[v_os_b1], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[220:221], v[156:157], v[232:235]
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[222:223], v[158:159], v[232:235]
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[208:209], v[176:177], v[236:239]
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[210:211], v[178:179], v[236:239]
\n
"
" buffer_load_dwordx4 acc[88:91], %[v_os_b1], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[212:213], v[180:181], v[236:239]
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[214:215], v[182:183], v[236:239]
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[216:217], v[184:185], v[236:239]
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[218:219], v[186:187], v[236:239]
\n
"
" buffer_load_dwordx4 acc[92:95], %[v_os_b1], s[12:15], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[220:221], v[188:189], v[236:239]
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[222:223], v[190:191], v[236:239]
\n
"
" s_waitcnt vmcnt(40)
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[224:225], v[144:145], v[240:243]
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[226:227], v[146:147], v[240:243]
\n
"
" buffer_load_dwordx4 acc[96:99], %[v_os_b2], s[12:15], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[228:229], v[148:149], v[240:243]
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[230:231], v[150:151], v[240:243]
\n
"
" buffer_load_dword v12, v5, s[16:19], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[232:233], v[152:153], v[240:243]
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[234:235], v[154:155], v[240:243]
\n
"
" buffer_load_dwordx4 acc[100:103], %[v_os_b2], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[236:237], v[156:157], v[240:243]
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[238:239], v[158:159], v[240:243]
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[224:225], v[176:177], v[244:247]
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[226:227], v[178:179], v[244:247]
\n
"
" buffer_load_dwordx4 acc[104:107], %[v_os_b2], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[228:229], v[180:181], v[244:247]
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[230:231], v[182:183], v[244:247]
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[232:233], v[184:185], v[244:247]
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[234:235], v[186:187], v[244:247]
\n
"
" buffer_load_dwordx4 acc[108:111], %[v_os_b2], s[12:15], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[236:237], v[188:189], v[244:247]
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[238:239], v[190:191], v[244:247]
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[240:241], v[144:145], v[248:251]
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[242:243], v[146:147], v[248:251]
\n
"
" buffer_load_dwordx4 acc[112:115], %[v_os_b3], s[12:15], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[244:245], v[148:149], v[248:251]
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[246:247], v[150:151], v[248:251]
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[248:249], v[152:153], v[248:251]
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[250:251], v[154:155], v[248:251]
\n
"
" buffer_load_dwordx4 acc[116:119], %[v_os_b3], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[252:253], v[156:157], v[248:251]
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[254:255], v[158:159], v[248:251]
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[240:241], v[176:177], v[252:255]
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[242:243], v[178:179], v[252:255]
\n
"
" buffer_load_dwordx4 acc[120:123], %[v_os_b3], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[244:245], v[180:181], v[252:255]
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[246:247], v[182:183], v[252:255]
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[248:249], v[184:185], v[252:255]
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[250:251], v[186:187], v[252:255]
\n
"
" buffer_load_dwordx4 acc[124:127], %[v_os_b3], s[12:15], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[252:253], v[188:189], v[252:255]
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[254:255], v[190:191], v[252:255]
\n
"
" s_add_u32 s60, 0x00000200, s80
\n
"
" s_cmp_lt_u32 s60, s81
\n
"
" s_cselect_b32 %[s_tile_os_b], %[s_tile_os_b], 0
\n
"
" s_cselect_b32 %[s_tile_os_b_half], %[s_tile_os_b_half], 0
\n
"
" s_cselect_b32 %[s_tile_os_dq], %[s_tile_os_dq], 0
\n
"
" s_add_u32 s12, %[s_tile_os_b], s12
\n
"
" s_addc_u32 s13, 0, s13
\n
"
" s_add_u32 s16, %[s_tile_os_dq], s16
\n
"
" s_addc_u32 s17, 0, s17
\n
"
" v_cvt_f32_i32 v224, v224
\n
"
" v_cvt_f32_i32 v225, v225
\n
"
" v_cvt_f32_i32 v226, v226
\n
"
" v_cvt_f32_i32 v227, v227
\n
"
" v_mul_f32 v224, v24, v224
\n
"
" v_mul_f32 v225, v24, v225
\n
"
" v_mul_f32 v226, v24, v226
\n
"
" v_mul_f32 v227, v24, v227
\n
"
" v_mul_f32 v224, v13, v224 row_newbcast:0
\n
"
" v_mul_f32 v225, v13, v225 row_newbcast:1
\n
"
" v_mul_f32 v226, v13, v226 row_newbcast:2
\n
"
" v_mul_f32 v227, v13, v227 row_newbcast:3
\n
"
" v_mul_f32 v224, v20, v224
\n
"
" v_mul_f32 v225, v20, v225
\n
"
" v_mul_f32 v226, v20, v226
\n
"
" v_mul_f32 v227, v20, v227
\n
"
" v_cvt_f32_i32 v228, v228
\n
"
" v_cvt_f32_i32 v229, v229
\n
"
" v_cvt_f32_i32 v230, v230
\n
"
" v_cvt_f32_i32 v231, v231
\n
"
" v_mul_f32 v228, v25, v228
\n
"
" v_mul_f32 v229, v25, v229
\n
"
" v_mul_f32 v230, v25, v230
\n
"
" v_mul_f32 v231, v25, v231
\n
"
" v_mul_f32 v228, v13, v228 row_newbcast:0
\n
"
" v_mul_f32 v229, v13, v229 row_newbcast:1
\n
"
" v_mul_f32 v230, v13, v230 row_newbcast:2
\n
"
" v_mul_f32 v231, v13, v231 row_newbcast:3
\n
"
" v_mul_f32 v228, v21, v228
\n
"
" v_mul_f32 v229, v21, v229
\n
"
" v_mul_f32 v230, v21, v230
\n
"
" v_mul_f32 v231, v21, v231
\n
"
" v_cvt_f32_i32 v232, v232
\n
"
" v_cvt_f32_i32 v233, v233
\n
"
" v_cvt_f32_i32 v234, v234
\n
"
" v_cvt_f32_i32 v235, v235
\n
"
" v_mul_f32 v232, v24, v232
\n
"
" v_mul_f32 v233, v24, v233
\n
"
" v_mul_f32 v234, v24, v234
\n
"
" v_mul_f32 v235, v24, v235
\n
"
" v_mul_f32 v232, v13, v232 row_newbcast:4
\n
"
" v_mul_f32 v233, v13, v233 row_newbcast:5
\n
"
" v_mul_f32 v234, v13, v234 row_newbcast:6
\n
"
" v_mul_f32 v235, v13, v235 row_newbcast:7
\n
"
" v_mul_f32 v232, v20, v232
\n
"
" v_mul_f32 v233, v20, v233
\n
"
" v_mul_f32 v234, v20, v234
\n
"
" v_mul_f32 v235, v20, v235
\n
"
" v_cvt_f32_i32 v236, v236
\n
"
" v_cvt_f32_i32 v237, v237
\n
"
" v_cvt_f32_i32 v238, v238
\n
"
" v_cvt_f32_i32 v239, v239
\n
"
" v_mul_f32 v236, v25, v236
\n
"
" v_mul_f32 v237, v25, v237
\n
"
" v_mul_f32 v238, v25, v238
\n
"
" v_mul_f32 v239, v25, v239
\n
"
" v_mul_f32 v236, v13, v236 row_newbcast:4
\n
"
" v_mul_f32 v237, v13, v237 row_newbcast:5
\n
"
" v_mul_f32 v238, v13, v238 row_newbcast:6
\n
"
" v_mul_f32 v239, v13, v239 row_newbcast:7
\n
"
" v_mul_f32 v236, v21, v236
\n
"
" v_mul_f32 v237, v21, v237
\n
"
" v_mul_f32 v238, v21, v238
\n
"
" v_mul_f32 v239, v21, v239
\n
"
" v_cvt_f32_i32 v240, v240
\n
"
" v_cvt_f32_i32 v241, v241
\n
"
" v_cvt_f32_i32 v242, v242
\n
"
" v_cvt_f32_i32 v243, v243
\n
"
" v_mul_f32 v240, v24, v240
\n
"
" v_mul_f32 v241, v24, v241
\n
"
" v_mul_f32 v242, v24, v242
\n
"
" v_mul_f32 v243, v24, v243
\n
"
" v_mul_f32 v240, v13, v240 row_newbcast:8
\n
"
" v_mul_f32 v241, v13, v241 row_newbcast:9
\n
"
" v_mul_f32 v242, v13, v242 row_newbcast:10
\n
"
" v_mul_f32 v243, v13, v243 row_newbcast:11
\n
"
" v_mul_f32 v240, v20, v240
\n
"
" v_mul_f32 v241, v20, v241
\n
"
" v_mul_f32 v242, v20, v242
\n
"
" v_mul_f32 v243, v20, v243
\n
"
" v_cvt_f32_i32 v244, v244
\n
"
" v_cvt_f32_i32 v245, v245
\n
"
" v_cvt_f32_i32 v246, v246
\n
"
" v_cvt_f32_i32 v247, v247
\n
"
" v_mul_f32 v244, v25, v244
\n
"
" v_mul_f32 v245, v25, v245
\n
"
" v_mul_f32 v246, v25, v246
\n
"
" v_mul_f32 v247, v25, v247
\n
"
" v_mul_f32 v244, v13, v244 row_newbcast:8
\n
"
" v_mul_f32 v245, v13, v245 row_newbcast:9
\n
"
" v_mul_f32 v246, v13, v246 row_newbcast:10
\n
"
" v_mul_f32 v247, v13, v247 row_newbcast:11
\n
"
" v_mul_f32 v244, v21, v244
\n
"
" v_mul_f32 v245, v21, v245
\n
"
" v_mul_f32 v246, v21, v246
\n
"
" v_mul_f32 v247, v21, v247
\n
"
" v_cvt_f32_i32 v248, v248
\n
"
" v_cvt_f32_i32 v249, v249
\n
"
" v_cvt_f32_i32 v250, v250
\n
"
" v_cvt_f32_i32 v251, v251
\n
"
" v_mul_f32 v248, v24, v248
\n
"
" v_mul_f32 v249, v24, v249
\n
"
" v_mul_f32 v250, v24, v250
\n
"
" v_mul_f32 v251, v24, v251
\n
"
" v_mul_f32 v248, v13, v248 row_newbcast:12
\n
"
" v_mul_f32 v249, v13, v249 row_newbcast:13
\n
"
" v_mul_f32 v250, v13, v250 row_newbcast:14
\n
"
" v_mul_f32 v251, v13, v251 row_newbcast:15
\n
"
" v_mul_f32 v248, v20, v248
\n
"
" v_mul_f32 v249, v20, v249
\n
"
" v_mul_f32 v250, v20, v250
\n
"
" v_mul_f32 v251, v20, v251
\n
"
" v_cvt_f32_i32 v252, v252
\n
"
" v_cvt_f32_i32 v253, v253
\n
"
" v_cvt_f32_i32 v254, v254
\n
"
" v_cvt_f32_i32 v255, v255
\n
"
" v_mul_f32 v252, v25, v252
\n
"
" v_mul_f32 v253, v25, v253
\n
"
" v_mul_f32 v254, v25, v254
\n
"
" v_mul_f32 v255, v25, v255
\n
"
" v_mul_f32 v252, v13, v252 row_newbcast:12
\n
"
" v_mul_f32 v253, v13, v253 row_newbcast:13
\n
"
" v_mul_f32 v254, v13, v254 row_newbcast:14
\n
"
" v_mul_f32 v255, v13, v255 row_newbcast:15
\n
"
" v_mul_f32 v252, v21, v252
\n
"
" v_mul_f32 v253, v21, v253
\n
"
" v_mul_f32 v254, v21, v254
\n
"
" v_mul_f32 v255, v21, v255
\n
"
" v_cmp_u_f32 s[48:49], v224, v224
\n
"
" v_add3_u32 v50, v224, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v225, v225
\n
"
" v_add3_u32 v50, v225, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v224, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v226, v226
\n
"
" v_add3_u32 v50, v226, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v227, v227
\n
"
" v_add3_u32 v50, v227, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v225, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v228, v228
\n
"
" v_add3_u32 v50, v228, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v229, v229
\n
"
" v_add3_u32 v50, v229, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v226, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v230, v230
\n
"
" v_add3_u32 v50, v230, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v231, v231
\n
"
" v_add3_u32 v50, v231, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v227, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v232, v232
\n
"
" v_add3_u32 v50, v232, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v233, v233
\n
"
" v_add3_u32 v50, v233, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v228, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v234, v234
\n
"
" v_add3_u32 v50, v234, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v235, v235
\n
"
" v_add3_u32 v50, v235, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v229, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v236, v236
\n
"
" v_add3_u32 v50, v236, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v237, v237
\n
"
" v_add3_u32 v50, v237, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v230, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v238, v238
\n
"
" v_add3_u32 v50, v238, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v239, v239
\n
"
" v_add3_u32 v50, v239, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v231, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v240, v240
\n
"
" v_add3_u32 v50, v240, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v241, v241
\n
"
" v_add3_u32 v50, v241, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v232, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v242, v242
\n
"
" v_add3_u32 v50, v242, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v243, v243
\n
"
" v_add3_u32 v50, v243, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v233, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v244, v244
\n
"
" v_add3_u32 v50, v244, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v245, v245
\n
"
" v_add3_u32 v50, v245, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v234, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v246, v246
\n
"
" v_add3_u32 v50, v246, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v247, v247
\n
"
" v_add3_u32 v50, v247, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v235, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v248, v248
\n
"
" v_add3_u32 v50, v248, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v249, v249
\n
"
" v_add3_u32 v50, v249, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v236, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v250, v250
\n
"
" v_add3_u32 v50, v250, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v251, v251
\n
"
" v_add3_u32 v50, v251, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v237, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v252, v252
\n
"
" v_add3_u32 v50, v252, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v253, v253
\n
"
" v_add3_u32 v50, v253, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v238, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v254, v254
\n
"
" v_add3_u32 v50, v254, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v255, v255
\n
"
" v_add3_u32 v50, v255, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v239, v55, v54, s52
\n
"
" ds_write_b64 v3, v[224:225] offset:35072
\n
"
" ds_write_b64 v3, v[226:227] offset:43776
\n
"
" ds_write_b64 v3, v[228:229] offset:37248
\n
"
" ds_write_b64 v3, v[230:231] offset:45952
\n
"
" ds_write_b64 v3, v[232:233] offset:39424
\n
"
" ds_write_b64 v3, v[234:235] offset:48128
\n
"
" ds_write_b64 v3, v[236:237] offset:41600
\n
"
" ds_write_b64 v3, v[238:239] offset:50304
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_barrier
\n
"
" ds_read_b32 v64, v4 offset:35072
\n
"
" ds_read_b32 v65, v4 offset:39424
\n
"
" ds_read_b32 v66, v4 offset:35104
\n
"
" ds_read_b32 v67, v4 offset:39456
\n
"
" ds_read_b32 v68, v4 offset:35136
\n
"
" ds_read_b32 v69, v4 offset:39488
\n
"
" ds_read_b32 v70, v4 offset:35168
\n
"
" ds_read_b32 v71, v4 offset:39520
\n
"
" ds_read_b32 v72, v4 offset:43776
\n
"
" ds_read_b32 v73, v4 offset:48128
\n
"
" ds_read_b32 v74, v4 offset:43808
\n
"
" ds_read_b32 v75, v4 offset:48160
\n
"
" ds_read_b32 v76, v4 offset:43840
\n
"
" ds_read_b32 v77, v4 offset:48192
\n
"
" ds_read_b32 v78, v4 offset:43872
\n
"
" ds_read_b32 v79, v4 offset:48224
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_mov_b64 exec, s[20:21]
\n
"
" global_atomic_pk_add_bf16 v80, v64, s[8:9]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[20:21]
\n
"
" global_atomic_pk_add_bf16 v80, v65, s[8:9] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[22:23]
\n
"
" global_atomic_pk_add_bf16 v82, v66, s[8:9]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[22:23]
\n
"
" global_atomic_pk_add_bf16 v82, v67, s[8:9] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[24:25]
\n
"
" global_atomic_pk_add_bf16 v84, v68, s[8:9]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[24:25]
\n
"
" global_atomic_pk_add_bf16 v84, v69, s[8:9] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[26:27]
\n
"
" global_atomic_pk_add_bf16 v86, v70, s[8:9]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[26:27]
\n
"
" global_atomic_pk_add_bf16 v86, v71, s[8:9] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[28:29]
\n
"
" global_atomic_pk_add_bf16 v88, v72, s[8:9]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[28:29]
\n
"
" global_atomic_pk_add_bf16 v88, v73, s[8:9] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[30:31]
\n
"
" global_atomic_pk_add_bf16 v90, v74, s[8:9]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[30:31]
\n
"
" global_atomic_pk_add_bf16 v90, v75, s[8:9] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[32:33]
\n
"
" global_atomic_pk_add_bf16 v92, v76, s[8:9]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[32:33]
\n
"
" global_atomic_pk_add_bf16 v92, v77, s[8:9] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[34:35]
\n
"
" global_atomic_pk_add_bf16 v94, v78, s[8:9]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[34:35]
\n
"
" global_atomic_pk_add_bf16 v94, v79, s[8:9] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_add_u32 s8, s59, s8
\n
"
" s_addc_u32 s9, 0, s9
\n
"
" s_addk_i32 s80, 0x0100
\n
"
" s_cmp_lt_i32 s80, s81
\n
"
" s_cbranch_scc0 label_2301
\n
"
" s_branch label_0C3C
\n
"
" label_2301:
\n
"
" s_waitcnt 0x0000
\n
"
" s_endpgm
\n
"
// Undefine the _UK_* helper macros at the end of the assembly text so they do
// not remain defined for whatever follows this fragment.
// NOTE(review): the matching #define lines are presumably near the top of this
// .inc file — confirm all three are defined there.
#undef _UK_MFMA_
#undef _UK_PK_CVT_
#undef _UK_ATOMIC_ADD_
include/ck_tile/ops/flatmm/block/uk/flatmm_uk_gfx9_32x512x256_1x1x1_16x16x32_int8.inc
View file @
f549173b
...
...
@@ -12,7 +12,26 @@
" v_mul_f32 a[2], v17, a[2] row_newbcast:14
\n
"
\
" v_mul_f32 a[3], v17, a[3] row_newbcast:15
\n
"
\
"s_mov_b32 s16, %[s_res_dq0]
\n
"
"s_mov_b32 s17, %[s_res_dq1]
\n
"
"s_mov_b32 s18, %[s_res_dq2]
\n
"
"s_mov_b32 s19, %[s_res_dq3]
\n
"
"s_mov_b32 s32, %[s_res_gq0]
\n
"
"s_mov_b32 s33, %[s_res_gq1]
\n
"
"s_mov_b32 s34, %[s_res_gq2]
\n
"
"s_mov_b32 s35, %[s_res_gq3]
\n
"
"s_mov_b32 s36, %[s_res_smq0]
\n
"
"s_mov_b32 s37, %[s_res_smq1]
\n
"
"s_mov_b32 s38, %[s_res_smq2]
\n
"
"s_mov_b32 s39, %[s_res_smq3]
\n
"
"s_mov_b32 s20, %[s_res_a0]
\n
"
"s_mov_b32 s21, %[s_res_a1]
\n
"
"s_mov_b32 s22, %[s_res_a2]
\n
"
"s_mov_b32 s23, %[s_res_a3]
\n
"
"s_mov_b32 s24, %[s_res_b0]
\n
"
"s_mov_b32 s25, %[s_res_b1]
\n
"
"s_mov_b32 s26, %[s_res_b2]
\n
"
"s_mov_b32 s27, %[s_res_b3]
\n
"
//////////GQ/DQ/GsmQ_addr///////////////
//expert weight addr no need
...
...
@@ -84,22 +103,6 @@
" buffer_load_dword v20, v8, s[40:43], 0 offen
\n
"
" buffer_load_dword v21, v9, s[40:43], 0 offen
\n
"
"s_mov_b32 s16, %[s_res_dq0]
\n
"
"s_mov_b32 s17, %[s_res_dq1]
\n
"
"s_mov_b32 s18, %[s_res_dq2]
\n
"
"s_mov_b32 s19, %[s_res_dq3]
\n
"
"s_mov_b32 s32, %[s_res_gq0]
\n
"
"s_mov_b32 s33, %[s_res_gq1]
\n
"
"s_mov_b32 s34, %[s_res_gq2]
\n
"
"s_mov_b32 s35, %[s_res_gq3]
\n
"
"s_mov_b32 s20, %[s_res_a0]
\n
"
"s_mov_b32 s21, %[s_res_a1]
\n
"
"s_mov_b32 s22, %[s_res_a2]
\n
"
"s_mov_b32 s23, %[s_res_a3]
\n
"
"s_mov_b32 s24, %[s_res_b0]
\n
"
"s_mov_b32 s25, %[s_res_b1]
\n
"
"s_mov_b32 s26, %[s_res_b2]
\n
"
"s_mov_b32 s27, %[s_res_b3]
\n
"
" s_mov_b32 s80, 0
\n
"
//---------------------v26-33 no need
// "s_nop 4\n"
...
...
include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk_int8.hpp
View file @
f549173b
...
...
@@ -180,6 +180,7 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8
/////////////
index_t
a_scale_expert_stride_0
=
kargs
.
hidden_size
;
index_t
g_scale_expert_stride_0
=
shared_intermediate_size_0
;
index_t
smq_scale_expert_stride_0
=
shared_intermediate_size_0
;
index_t
d_scale_expert_stride_1
=
kargs
.
hidden_size
;
// nr*kr*w
index_t
interm_idx_nr0
=
__builtin_amdgcn_readfirstlane
(
...
...
@@ -244,12 +245,12 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8
number
<
decltype
(
g_win
)
::
NumAccess_NonLinear
>
{});
//////gq
auto
gq_win
=
[
&
]()
{
const
GDataType
*
g_ptr
=
reinterpret_cast
<
const
GScaleDataType
*>
(
kargs
.
g_scale_ptr
)
+
const
G
Scale
DataType
*
g
q
_ptr
=
reinterpret_cast
<
const
GScaleDataType
*>
(
kargs
.
g_scale_ptr
)
+
static_cast
<
long_index_t
>
(
expert_id
)
*
g_scale_expert_stride_0
+
intermediate_tile_id
*
BlockShape
::
Block_N0
;
// const GDataType* g_ptr = reinterpret_cast<const GScaleDataType*>(kargs.g_scale_ptr);//remember to add expert id for inline
auto
g_view_
=
make_naive_tensor_view
<
address_space_enum
::
global
>
(
g_ptr
,
auto
g
q
_view_
=
make_naive_tensor_view
<
address_space_enum
::
global
>
(
g
q
_ptr
,
make_tuple
(
shared_intermediate_size_1
),
number
<
1
>
{});
...
...
@@ -257,7 +258,22 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8
}();
auto
gq_res
=
gq_win
.
get_buffer_view
().
cached_buf_res_
;
////
////smQ
auto
smq_win
=
[
&
]()
{
const
YSmoothScaleDataType
*
smq_ptr
=
reinterpret_cast
<
const
YSmoothScaleDataType
*>
(
kargs
.
y_smooth_scale_ptr
)
+
static_cast
<
long_index_t
>
(
expert_id
)
*
smq_scale_expert_stride_0
+
intermediate_tile_id
*
BlockShape
::
Block_N0
;
// const GDataType* g_ptr = reinterpret_cast<const GScaleDataType*>(kargs.g_scale_ptr);//remember to add expert id for inline
auto
smq_view_
=
make_naive_tensor_view
<
address_space_enum
::
global
>
(
smq_ptr
,
make_tuple
(
shared_intermediate_size_1
),
number
<
1
>
{});
return
smq_view_
;
}();
auto
smq_res
=
smq_win
.
get_buffer_view
().
cached_buf_res_
;
/////////////////////
const
auto
d_win
=
[
&
]()
{
const
DDataType
*
d_ptr
=
reinterpret_cast
<
const
DDataType
*>
(
kargs
.
d_ptr
)
+
static_cast
<
long_index_t
>
(
expert_id
)
*
expert_stride_1
+
...
...
@@ -284,8 +300,9 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8
auto
d_res
=
d_win
.
get_bottom_tensor_view
().
get_buffer_view
().
cached_buf_res_
;
//////dq
auto
dq_win
=
[
&
]()
{
// const GDataType* g_ptr = reinterpret_cast<const GScaleDataType*>(kargs.d_scale_ptr) + static_cast<long_index_t>(expert_id) * d_scale_expert_stride_0;
const
GDataType
*
g_ptr
=
reinterpret_cast
<
const
GScaleDataType
*>
(
kargs
.
d_scale_ptr
)
//remember to add expert_id as expert_idx
const
DScaleDataType
*
g_ptr
=
reinterpret_cast
<
const
DScaleDataType
*>
(
kargs
.
d_scale_ptr
)
+
static_cast
<
long_index_t
>
(
expert_id
)
*
d_scale_expert_stride_1
;
// const GDataType* g_ptr = reinterpret_cast<const GScaleDataType*>(kargs.d_scale_ptr)//remember to add expert_id as expert_idx
auto
g_view_
=
make_naive_tensor_view
<
address_space_enum
::
global
>
(
g_ptr
,
make_tuple
(
kargs
.
hidden_size
),
...
...
@@ -368,7 +385,7 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8
kargs
.
hidden_size
,
BlockShape
::
Block_K0
,
// tile offset for B matrix each unroll
BlockShape
::
Block_Kr0
*
BlockShape
::
Block_W0
);
// tile offset for B matrix each unroll
BlockShape
::
Block_W0
);
// tile offset for B matrix each unroll
// sweep_tile(
// acc_0,
...
...
@@ -396,6 +413,7 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8
smem
,
kargs
.
hidden_size
,
// total n number
w_scale
,
BlockShape
::
Block_N1
,
shared_intermediate_size_1
*
Block_N1
-
kr_1
*
BlockShape
::
Block_W1
,
// along N
kr_1
*
BlockShape
::
Block_W1
,
BlockShape
::
Block_N1
);
// along N
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment