Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
26d84960
Commit
26d84960
authored
Jan 03, 2025
by
shengnxu
Browse files
change some inline parameter style
parent
d0c80b12
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
386 additions
and
344 deletions
+386
-344
include/ck_tile/ops/flatmm/block/flatmm_32x512x256_1x4x1_16x16x64_int8.hpp
...ps/flatmm/block/flatmm_32x512x256_1x4x1_16x16x64_int8.hpp
+129
-88
include/ck_tile/ops/flatmm/block/uk/flatmm_uk_gfx9_32x512x256_1x1x1_16x16x32_int8.inc
...lock/uk/flatmm_uk_gfx9_32x512x256_1x1x1_16x16x32_int8.inc
+257
-256
No files found.
include/ck_tile/ops/flatmm/block/flatmm_32x512x256_1x4x1_16x16x64_int8.hpp
View file @
26d84960
...
@@ -384,22 +384,70 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16
...
@@ -384,22 +384,70 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16
#include "uk/flatmm_uk_gfx9_32x512x256_1x1x1_16x16x32_int8.inc"
#include "uk/flatmm_uk_gfx9_32x512x256_1x1x1_16x16x32_int8.inc"
#undef CK_TILE_FLATMM_UK_MFMA
#undef CK_TILE_FLATMM_UK_MFMA
:
[
s_loop_cnt
]
"+s"
(
loop_cnt
),
:
[
s_loop_cnt
]
"+s"
(
loop_cnt
),
// [v_acc_0]"+v"(v_acc[0]),
[
c0
]
"+v"
(
v_z0
),
// [v_acc_1]"+v"(v_acc[1]),
[
c1
]
"+v"
(
v_z1
),
// [v_acc_2]"+v"(v_acc[2]),
[
c2
]
"+v"
(
v_z2
),
// [v_acc_3]"+v"(v_acc[3]),
[
c3
]
"+v"
(
v_z3
),
// [v_acc_4]"+v"(v_acc[4]),
[
c4
]
"+v"
(
v_z4
),
// [v_acc_5]"+v"(v_acc[5]),
[
c5
]
"+v"
(
v_z5
),
// [v_acc_6]"+v"(v_acc[6]),
[
c6
]
"+v"
(
v_z6
),
// [v_acc_7]"+v"(v_acc[7]),
[
c7
]
"+v"
(
v_z7
),
// [v_acc_8]"+v"(v_acc[8]),
[
c8
]
"+v"
(
v_z8
),
// [v_acc_9]"+v"(v_acc[9]),
[
c9
]
"+v"
(
v_z9
),
// [v_acc_10]"+v"(v_acc[10]),
[
c10
]
"+v"
(
v_z10
),
// [v_acc_11]"+v"(v_acc[11]),
[
c11
]
"+v"
(
v_z11
),
// [v_acc_12]"+v"(v_acc[12]),
[
c12
]
"+v"
(
v_z12
),
// [v_acc_13]"+v"(v_acc[13]),
[
c13
]
"+v"
(
v_z13
),
// [v_acc_14]"+v"(v_acc[14]),
[
c14
]
"+v"
(
v_z14
),
// [v_acc_15]"+v"(v_acc[15]),
[
c15
]
"+v"
(
v_z15
),
[
c16
]
"+v"
(
v_z16
),
[
c17
]
"+v"
(
v_z17
),
[
c18
]
"+v"
(
v_z18
),
[
c19
]
"+v"
(
v_z19
),
[
c20
]
"+v"
(
v_z20
),
[
c21
]
"+v"
(
v_z21
),
[
c22
]
"+v"
(
v_z22
),
[
c23
]
"+v"
(
v_z23
),
[
c24
]
"+v"
(
v_z24
),
[
c25
]
"+v"
(
v_z25
),
[
c26
]
"+v"
(
v_z26
),
[
c27
]
"+v"
(
v_z27
),
[
c28
]
"+v"
(
v_z28
),
[
c29
]
"+v"
(
v_z29
),
[
c30
]
"+v"
(
v_z30
),
[
c31
]
"+v"
(
v_z31
),
[
c32
]
"+v"
(
v_z32
),
[
c33
]
"+v"
(
v_z33
),
[
c34
]
"+v"
(
v_z34
),
[
c35
]
"+v"
(
v_z35
),
[
c36
]
"+v"
(
v_z36
),
[
c37
]
"+v"
(
v_z37
),
[
c38
]
"+v"
(
v_z38
),
[
c39
]
"+v"
(
v_z39
),
[
c40
]
"+v"
(
v_z40
),
[
c41
]
"+v"
(
v_z41
),
[
c42
]
"+v"
(
v_z42
),
[
c43
]
"+v"
(
v_z43
),
[
c44
]
"+v"
(
v_z44
),
[
c45
]
"+v"
(
v_z45
),
[
c46
]
"+v"
(
v_z46
),
[
c47
]
"+v"
(
v_z47
),
[
c48
]
"+v"
(
v_z48
),
[
c49
]
"+v"
(
v_z49
),
[
c50
]
"+v"
(
v_z50
),
[
c51
]
"+v"
(
v_z51
),
[
c52
]
"+v"
(
v_z52
),
[
c53
]
"+v"
(
v_z53
),
[
c54
]
"+v"
(
v_z54
),
[
c55
]
"+v"
(
v_z55
),
[
c56
]
"+v"
(
v_z56
),
[
c57
]
"+v"
(
v_z57
),
[
c58
]
"+v"
(
v_z58
),
[
c59
]
"+v"
(
v_z59
),
[
c60
]
"+v"
(
v_z60
),
[
c61
]
"+v"
(
v_z61
),
[
c62
]
"+v"
(
v_z62
),
[
c63
]
"+v"
(
v_z63
),
[
v_token_id0
]
"+v"
(
temp0
),
[
v_token_id0
]
"+v"
(
temp0
),
[
v_token_id1
]
"+v"
(
temp1
),
[
v_token_id1
]
"+v"
(
temp1
),
[
s_mem_
]
"+r"
(
smem
)
[
s_mem_
]
"+r"
(
smem
)
...
@@ -533,81 +581,74 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16
...
@@ -533,81 +581,74 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16
);
);
// clang-format on
// clang-format on
#pragma clang diagnostic pop
#pragma clang diagnostic pop
int32x4_t
v_acc
[
16
]{
0
};
v_acc
[
0
][
0
]
=
v_z0
;
v_acc
[
0
][
1
]
=
v_z1
;
v_acc
[
0
][
2
]
=
v_z2
;
v_acc
[
0
][
3
]
=
v_z3
;
v_acc
[
1
][
0
]
=
v_z4
;
v_acc
[
1
][
1
]
=
v_z5
;
v_acc
[
1
][
2
]
=
v_z6
;
v_acc
[
1
][
3
]
=
v_z7
;
v_acc
[
2
][
0
]
=
v_z8
;
v_acc
[
2
][
1
]
=
v_z9
;
v_acc
[
2
][
2
]
=
v_z10
;
v_acc
[
2
][
3
]
=
v_z11
;
v_acc
[
3
][
0
]
=
v_z12
;
v_acc
[
3
][
1
]
=
v_z13
;
v_acc
[
3
][
2
]
=
v_z14
;
v_acc
[
3
][
3
]
=
v_z15
;
v_acc
[
4
][
0
]
=
v_z16
;
v_acc
[
4
][
1
]
=
v_z17
;
v_acc
[
4
][
2
]
=
v_z18
;
v_acc
[
4
][
3
]
=
v_z19
;
v_acc
[
5
][
0
]
=
v_z20
;
v_acc
[
5
][
1
]
=
v_z21
;
v_acc
[
5
][
2
]
=
v_z22
;
v_acc
[
5
][
3
]
=
v_z23
;
v_acc
[
6
][
0
]
=
v_z24
;
v_acc
[
6
][
1
]
=
v_z25
;
v_acc
[
6
][
2
]
=
v_z26
;
v_acc
[
6
][
3
]
=
v_z27
;
v_acc
[
7
][
0
]
=
v_z28
;
v_acc
[
7
][
1
]
=
v_z29
;
v_acc
[
7
][
2
]
=
v_z30
;
v_acc
[
7
][
3
]
=
v_z31
;
v_acc
[
8
][
0
]
=
v_z32
;
v_acc
[
8
][
1
]
=
v_z33
;
v_acc
[
8
][
2
]
=
v_z34
;
v_acc
[
8
][
3
]
=
v_z35
;
v_acc
[
9
][
0
]
=
v_z36
;
v_acc
[
9
][
1
]
=
v_z37
;
v_acc
[
9
][
2
]
=
v_z38
;
v_acc
[
9
][
3
]
=
v_z39
;
v_acc
[
10
][
0
]
=
v_z40
;
v_acc
[
10
][
1
]
=
v_z41
;
v_acc
[
10
][
2
]
=
v_z42
;
v_acc
[
10
][
3
]
=
v_z43
;
v_acc
[
11
][
0
]
=
v_z44
;
v_acc
[
11
][
1
]
=
v_z45
;
v_acc
[
11
][
2
]
=
v_z46
;
v_acc
[
11
][
3
]
=
v_z47
;
v_acc
[
12
][
0
]
=
v_z48
;
v_acc
[
12
][
1
]
=
v_z49
;
v_acc
[
12
][
2
]
=
v_z50
;
v_acc
[
12
][
3
]
=
v_z51
;
v_acc
[
13
][
0
]
=
v_z52
;
v_acc
[
13
][
1
]
=
v_z53
;
v_acc
[
13
][
2
]
=
v_z54
;
v_acc
[
13
][
3
]
=
v_z55
;
v_acc
[
14
][
0
]
=
v_z56
;
v_acc
[
14
][
1
]
=
v_z57
;
v_acc
[
14
][
2
]
=
v_z58
;
v_acc
[
14
][
3
]
=
v_z59
;
v_acc
[
15
][
0
]
=
v_z60
;
v_acc
[
15
][
1
]
=
v_z61
;
v_acc
[
15
][
2
]
=
v_z62
;
v_acc
[
15
][
3
]
=
v_z63
;
// return local scratch
// return local scratch
auto
c
=
MakeCBlockTile
();
auto
c
=
MakeCBlockTile
();
for
(
auto
i
=
0
;
i
<
16
;
i
++
)
c
.
get_thread_buffer
()[
0
]
=
v_z0
;
{
c
.
get_thread_buffer
()[
1
]
=
v_z1
;
c
.
get_thread_buffer
()[
4
*
i
+
0
]
=
v_acc
[
i
].
x
;
c
.
get_thread_buffer
()[
2
]
=
v_z2
;
c
.
get_thread_buffer
()[
4
*
i
+
1
]
=
v_acc
[
i
].
y
;
c
.
get_thread_buffer
()[
3
]
=
v_z3
;
c
.
get_thread_buffer
()[
4
*
i
+
2
]
=
v_acc
[
i
].
z
;
c
.
get_thread_buffer
()[
4
]
=
v_z4
;
c
.
get_thread_buffer
()[
4
*
i
+
3
]
=
v_acc
[
i
].
w
;
c
.
get_thread_buffer
()[
5
]
=
v_z5
;
}
c
.
get_thread_buffer
()[
6
]
=
v_z6
;
c
.
get_thread_buffer
()[
7
]
=
v_z7
;
c
.
get_thread_buffer
()[
8
]
=
v_z8
;
c
.
get_thread_buffer
()[
9
]
=
v_z9
;
c
.
get_thread_buffer
()[
10
]
=
v_z10
;
c
.
get_thread_buffer
()[
11
]
=
v_z11
;
c
.
get_thread_buffer
()[
12
]
=
v_z12
;
c
.
get_thread_buffer
()[
13
]
=
v_z13
;
c
.
get_thread_buffer
()[
14
]
=
v_z14
;
c
.
get_thread_buffer
()[
15
]
=
v_z15
;
c
.
get_thread_buffer
()[
16
]
=
v_z16
;
c
.
get_thread_buffer
()[
17
]
=
v_z17
;
c
.
get_thread_buffer
()[
18
]
=
v_z18
;
c
.
get_thread_buffer
()[
19
]
=
v_z19
;
c
.
get_thread_buffer
()[
20
]
=
v_z20
;
c
.
get_thread_buffer
()[
21
]
=
v_z21
;
c
.
get_thread_buffer
()[
22
]
=
v_z22
;
c
.
get_thread_buffer
()[
23
]
=
v_z23
;
c
.
get_thread_buffer
()[
24
]
=
v_z24
;
c
.
get_thread_buffer
()[
25
]
=
v_z25
;
c
.
get_thread_buffer
()[
26
]
=
v_z26
;
c
.
get_thread_buffer
()[
27
]
=
v_z27
;
c
.
get_thread_buffer
()[
28
]
=
v_z28
;
c
.
get_thread_buffer
()[
29
]
=
v_z29
;
c
.
get_thread_buffer
()[
30
]
=
v_z30
;
c
.
get_thread_buffer
()[
31
]
=
v_z31
;
c
.
get_thread_buffer
()[
32
]
=
v_z32
;
c
.
get_thread_buffer
()[
33
]
=
v_z33
;
c
.
get_thread_buffer
()[
34
]
=
v_z34
;
c
.
get_thread_buffer
()[
35
]
=
v_z35
;
c
.
get_thread_buffer
()[
36
]
=
v_z36
;
c
.
get_thread_buffer
()[
37
]
=
v_z37
;
c
.
get_thread_buffer
()[
38
]
=
v_z38
;
c
.
get_thread_buffer
()[
39
]
=
v_z39
;
c
.
get_thread_buffer
()[
40
]
=
v_z40
;
c
.
get_thread_buffer
()[
41
]
=
v_z41
;
c
.
get_thread_buffer
()[
42
]
=
v_z42
;
c
.
get_thread_buffer
()[
43
]
=
v_z43
;
c
.
get_thread_buffer
()[
44
]
=
v_z44
;
c
.
get_thread_buffer
()[
45
]
=
v_z45
;
c
.
get_thread_buffer
()[
46
]
=
v_z46
;
c
.
get_thread_buffer
()[
47
]
=
v_z47
;
c
.
get_thread_buffer
()[
48
]
=
v_z48
;
c
.
get_thread_buffer
()[
49
]
=
v_z49
;
c
.
get_thread_buffer
()[
50
]
=
v_z50
;
c
.
get_thread_buffer
()[
51
]
=
v_z51
;
c
.
get_thread_buffer
()[
52
]
=
v_z52
;
c
.
get_thread_buffer
()[
53
]
=
v_z53
;
c
.
get_thread_buffer
()[
54
]
=
v_z54
;
c
.
get_thread_buffer
()[
55
]
=
v_z55
;
c
.
get_thread_buffer
()[
56
]
=
v_z56
;
c
.
get_thread_buffer
()[
57
]
=
v_z57
;
c
.
get_thread_buffer
()[
58
]
=
v_z58
;
c
.
get_thread_buffer
()[
59
]
=
v_z59
;
c
.
get_thread_buffer
()[
60
]
=
v_z60
;
c
.
get_thread_buffer
()[
61
]
=
v_z61
;
c
.
get_thread_buffer
()[
62
]
=
v_z62
;
c
.
get_thread_buffer
()[
63
]
=
v_z63
;
return
c
;
return
c
;
}
}
};
};
...
...
include/ck_tile/ops/flatmm/block/uk/flatmm_uk_gfx9_32x512x256_1x1x1_16x16x32_int8.inc
View file @
26d84960
...
@@ -189,193 +189,193 @@
...
@@ -189,193 +189,193 @@
" label_start:
\n
"
" label_start:
\n
"
" s_waitcnt vmcnt(24) & lgkmcnt(0)
\n
"
" s_waitcnt vmcnt(24) & lgkmcnt(0)
\n
"
" s_barrier
\n
"
" s_barrier
\n
"
_UK_MFMA_
"
v[128:131], acc[0:1], v[192:193], v[128:131
]
\n
"
_UK_MFMA_
"
[%[c0], %[c1], %[c2], %[c3]], acc[0:1], v[192:193], [%[c0], %[c1], %[c2], %[c3]
]
\n
"
_UK_MFMA_
"
v[128:131], acc[2:3], v[194:195], v[128:131
]
\n
"
_UK_MFMA_
"
[%[c0], %[c1], %[c2], %[c3]], acc[2:3], v[194:195], [%[c0], %[c1], %[c2], %[c3]
]
\n
"
" buffer_load_dwordx4 acc[128:131], %[v_os_b0], s[24:27], 0 offen
\n
"
" buffer_load_dwordx4 acc[128:131], %[v_os_b0], s[24:27], 0 offen
\n
"
_UK_MFMA_
"
v[128:131], acc[4:5], v[196:197], v[128:131
]
\n
"
_UK_MFMA_
"
[%[c0], %[c1], %[c2], %[c3]], acc[4:5], v[196:197], [%[c0], %[c1], %[c2], %[c3]
]
\n
"
_UK_MFMA_
"
v[128:131], acc[6:7], v[198:199], v[128:131
]
\n
"
_UK_MFMA_
"
[%[c0], %[c1], %[c2], %[c3]], acc[6:7], v[198:199], [%[c0], %[c1], %[c2], %[c3]
]
\n
"
" buffer_load_dword s[20:23], 0 offen lds
\n
"
" buffer_load_dword s[20:23], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
"
v[128:131], acc[8:9], v[200:201], v[128:131
]
\n
"
_UK_MFMA_
"
[%[c0], %[c1], %[c2], %[c3]], acc[8:9], v[200:201], [%[c0], %[c1], %[c2], %[c3]
]
\n
"
_UK_MFMA_
"
v[128:131], acc[10:11], v[202:203], v[128:131
]
\n
"
_UK_MFMA_
"
[%[c0], %[c1], %[c2], %[c3]], acc[10:11], v[202:203], [%[c0], %[c1], %[c2], %[c3]
]
\n
"
" buffer_load_dwordx4 acc[132:135], %[v_os_b0], s[24:27], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[132:135], %[v_os_b0], s[24:27], 0 offen offset:1024
\n
"
_UK_MFMA_
"
v[128:131], acc[12:13], v[204:205], v[128:131
]
\n
"
_UK_MFMA_
"
[%[c0], %[c1], %[c2], %[c3]], acc[12:13], v[204:205], [%[c0], %[c1], %[c2], %[c3]
]
\n
"
_UK_MFMA_
"
v[128:131], acc[14:15], v[206:207], v[128:131
]
\n
"
_UK_MFMA_
"
[%[c0], %[c1], %[c2], %[c3]], acc[14:15], v[206:207], [%[c0], %[c1], %[c2], %[c3]
]
\n
"
" buffer_load_dword s[20:23], 0 offen lds
\n
"
" buffer_load_dword s[20:23], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
"
v[132:135], acc[0:1], v[208:209], v[132:135
]
\n
"
_UK_MFMA_
"
[%[c4], %[c5], %[c6], %[c7]], acc[0:1], v[208:209], [%[c4], %[c5], %[c6], %[c7]
]
\n
"
_UK_MFMA_
"
v[132:135], acc[2:3], v[210:211], v[132:135
]
\n
"
_UK_MFMA_
"
[%[c4], %[c5], %[c6], %[c7]], acc[2:3], v[210:211], [%[c4], %[c5], %[c6], %[c7]
]
\n
"
" buffer_load_dwordx4 acc[136:139], %[v_os_b0], s[24:27], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[136:139], %[v_os_b0], s[24:27], 0 offen offset:2048
\n
"
_UK_MFMA_
"
v[132:135], acc[4:5], v[212:213], v[132:135
]
\n
"
_UK_MFMA_
"
[%[c4], %[c5], %[c6], %[c7]], acc[4:5], v[212:213], [%[c4], %[c5], %[c6], %[c7]
]
\n
"
_UK_MFMA_
"
v[132:135], acc[6:7], v[214:215], v[132:135
]
\n
"
_UK_MFMA_
"
[%[c4], %[c5], %[c6], %[c7]], acc[6:7], v[214:215], [%[c4], %[c5], %[c6], %[c7]
]
\n
"
" buffer_load_dword s[20:23], 0 offen lds
\n
"
" buffer_load_dword s[20:23], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
"
v[132:135], acc[8:9], v[216:217], v[132:135
]
\n
"
_UK_MFMA_
"
[%[c4], %[c5], %[c6], %[c7]], acc[8:9], v[216:217], [%[c4], %[c5], %[c6], %[c7]
]
\n
"
_UK_MFMA_
"
v[132:135], acc[10:11], v[218:219], v[132:135
]
\n
"
_UK_MFMA_
"
[%[c4], %[c5], %[c6], %[c7]], acc[10:11], v[218:219], [%[c4], %[c5], %[c6], %[c7]
]
\n
"
" buffer_load_dwordx4 acc[140:143], %[v_os_b0], s[24:27], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[140:143], %[v_os_b0], s[24:27], 0 offen offset:3072
\n
"
_UK_MFMA_
"
v[132:135], acc[12:13], v[220:221], v[132:135
]
\n
"
_UK_MFMA_
"
[%[c4], %[c5], %[c6], %[c7]], acc[12:13], v[220:221], [%[c4], %[c5], %[c6], %[c7]
]
\n
"
_UK_MFMA_
"
v[132:135], acc[14:15], v[222:223], v[132:135
]
\n
"
_UK_MFMA_
"
[%[c4], %[c5], %[c6], %[c7]], acc[14:15], v[222:223], [%[c4], %[c5], %[c6], %[c7]
]
\n
"
" buffer_load_dword s[20:23], 0 offen lds
\n
"
" buffer_load_dword s[20:23], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
"
v[136:139], acc[16:17], v[192:193], v[136:139
]
\n
"
_UK_MFMA_
"
[%[c8], %[c9], %[c10], %[c11]], acc[16:17], v[192:193], [%[c8], %[c9], %[c10], %[c11]
]
\n
"
_UK_MFMA_
"
v[136:139], acc[18:19], v[194:195], v[136:139
]
\n
"
_UK_MFMA_
"
[%[c8], %[c9], %[c10], %[c11]], acc[18:19], v[194:195], [%[c8], %[c9], %[c10], %[c11]
]
\n
"
" buffer_load_dwordx4 acc[144:147], %[v_os_b1], s[24:27], 0 offen
\n
"
" buffer_load_dwordx4 acc[144:147], %[v_os_b1], s[24:27], 0 offen
\n
"
_UK_MFMA_
"
v[136:139], acc[20:21], v[196:197], v[136:139
]
\n
"
_UK_MFMA_
"
[%[c8], %[c9], %[c10], %[c11]], acc[20:21], v[196:197], [%[c8], %[c9], %[c10], %[c11]
]
\n
"
_UK_MFMA_
"
v[136:139], acc[22:23], v[198:199], v[136:139
]
\n
"
_UK_MFMA_
"
[%[c8], %[c9], %[c10], %[c11]], acc[22:23], v[198:199], [%[c8], %[c9], %[c10], %[c11]
]
\n
"
" buffer_load_dword s[20:23], 0 offen lds
\n
"
" buffer_load_dword s[20:23], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
"
v[136:139], acc[24:25], v[200:201], v[136:139
]
\n
"
_UK_MFMA_
"
[%[c8], %[c9], %[c10], %[c11]], acc[24:25], v[200:201], [%[c8], %[c9], %[c10], %[c11]
]
\n
"
_UK_MFMA_
"
v[136:139], acc[26:27], v[202:203], v[136:139
]
\n
"
_UK_MFMA_
"
[%[c8], %[c9], %[c10], %[c11]], acc[26:27], v[202:203], [%[c8], %[c9], %[c10], %[c11]
]
\n
"
" buffer_load_dwordx4 acc[148:151], %[v_os_b1], s[24:27], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[148:151], %[v_os_b1], s[24:27], 0 offen offset:1024
\n
"
_UK_MFMA_
"
v[136:139], acc[28:29], v[204:205], v[136:139
]
\n
"
_UK_MFMA_
"
[%[c8], %[c9], %[c10], %[c11]], acc[28:29], v[204:205], [%[c8], %[c9], %[c10], %[c11]
]
\n
"
_UK_MFMA_
"
v[136:139], acc[30:31], v[206:207], v[136:139
]
\n
"
_UK_MFMA_
"
[%[c8], %[c9], %[c10], %[c11]], acc[30:31], v[206:207], [%[c8], %[c9], %[c10], %[c11]
]
\n
"
" buffer_load_dword s[20:23], 0 offen lds
\n
"
" buffer_load_dword s[20:23], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
"
v[140:143], acc[16:17], v[208:209], v[140:143
]
\n
"
_UK_MFMA_
"
[%[c12], %[c13], %[c14], %[c15]], acc[16:17], v[208:209], [%[c12], %[c13], %[c14], %[c15]
]
\n
"
_UK_MFMA_
"
v[140:143], acc[18:19], v[210:211], v[140:143
]
\n
"
_UK_MFMA_
"
[%[c12], %[c13], %[c14], %[c15]], acc[18:19], v[210:211], [%[c12], %[c13], %[c14], %[c15]
]
\n
"
" buffer_load_dwordx4 acc[152:155], %[v_os_b1], s[24:27], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[152:155], %[v_os_b1], s[24:27], 0 offen offset:2048
\n
"
_UK_MFMA_
"
v[140:143], acc[20:21], v[212:213], v[140:143
]
\n
"
_UK_MFMA_
"
[%[c12], %[c13], %[c14], %[c15]], acc[20:21], v[212:213], [%[c12], %[c13], %[c14], %[c15]
]
\n
"
_UK_MFMA_
"
v[140:143], acc[22:23], v[214:215], v[140:143
]
\n
"
_UK_MFMA_
"
[%[c12], %[c13], %[c14], %[c15]], acc[22:23], v[214:215], [%[c12], %[c13], %[c14], %[c15]
]
\n
"
" buffer_load_dword s[20:23], 0 offen lds
\n
"
" buffer_load_dword s[20:23], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
"
v[140:143], acc[24:25], v[216:217], v[140:143
]
\n
"
_UK_MFMA_
"
[%[c12], %[c13], %[c14], %[c15]], acc[24:25], v[216:217], [%[c12], %[c13], %[c14], %[c15]
]
\n
"
_UK_MFMA_
"
v[140:143], acc[26:27], v[218:219], v[140:143
]
\n
"
_UK_MFMA_
"
[%[c12], %[c13], %[c14], %[c15]], acc[26:27], v[218:219], [%[c12], %[c13], %[c14], %[c15]
]
\n
"
" buffer_load_dwordx4 acc[156:159], %[v_os_b1], s[24:27], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[156:159], %[v_os_b1], s[24:27], 0 offen offset:3072
\n
"
_UK_MFMA_
"
v[140:143], acc[28:29], v[220:221], v[140:143
]
\n
"
_UK_MFMA_
"
[%[c12], %[c13], %[c14], %[c15]], acc[28:29], v[220:221], [%[c12], %[c13], %[c14], %[c15]
]
\n
"
_UK_MFMA_
"
v[140:143], acc[30:31], v[222:223], v[140:143
]
\n
"
_UK_MFMA_
"
[%[c12], %[c13], %[c14], %[c15]], acc[30:31], v[222:223], [%[c12], %[c13], %[c14], %[c15]
]
\n
"
" buffer_load_dword s[20:23], 0 offen lds
\n
"
" buffer_load_dword s[20:23], 0 offen lds
\n
"
" s_add_u32 m0, %[smem_sz], %[s_m0_init]
\n
"
" s_add_u32 m0, %[smem_sz], %[s_m0_init]
\n
"
" s_waitcnt vmcnt(32)
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
"
v[144:147], acc[32:33], v[192:193], v[144:147
]
\n
"
_UK_MFMA_
"
[%[c16], %[c17], %[c18], %[c19]], acc[32:33], v[192:193], [%[c16], %[c17], %[c18], %[c19]
]
\n
"
_UK_MFMA_
"
v[144:147], acc[34:35], v[194:195], v[144:147
]
\n
"
_UK_MFMA_
"
[%[c16], %[c17], %[c18], %[c19]], acc[34:35], v[194:195], [%[c16], %[c17], %[c18], %[c19]
]
\n
"
" buffer_load_dwordx4 acc[160:163], %[v_os_b2], s[24:27], 0 offen
\n
"
" buffer_load_dwordx4 acc[160:163], %[v_os_b2], s[24:27], 0 offen
\n
"
_UK_MFMA_
"
v[144:147], acc[36:37], v[196:197], v[144:147
]
\n
"
_UK_MFMA_
"
[%[c16], %[c17], %[c18], %[c19]], acc[36:37], v[196:197], [%[c16], %[c17], %[c18], %[c19]
]
\n
"
_UK_MFMA_
"
v[144:147], acc[38:39], v[198:199], v[144:147
]
\n
"
_UK_MFMA_
"
[%[c16], %[c17], %[c18], %[c19]], acc[38:39], v[198:199], [%[c16], %[c17], %[c18], %[c19]
]
\n
"
" ds_read_b128 v[224:227], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_0]
\n
"
" ds_read_b128 v[224:227], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_0]
\n
"
_UK_MFMA_
"
v[144:147], acc[40:41], v[200:201], v[144:147
]
\n
"
_UK_MFMA_
"
[%[c16], %[c17], %[c18], %[c19]], acc[40:41], v[200:201], [%[c16], %[c17], %[c18], %[c19]
]
\n
"
_UK_MFMA_
"
v[144:147], acc[42:43], v[202:203], v[144:147
]
\n
"
_UK_MFMA_
"
[%[c16], %[c17], %[c18], %[c19]], acc[42:43], v[202:203], [%[c16], %[c17], %[c18], %[c19]
]
\n
"
" buffer_load_dwordx4 acc[164:167], %[v_os_b2], s[24:27], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[164:167], %[v_os_b2], s[24:27], 0 offen offset:1024
\n
"
_UK_MFMA_
"
v[144:147], acc[44:45], v[204:205], v[144:147
]
\n
"
_UK_MFMA_
"
[%[c16], %[c17], %[c18], %[c19]], acc[44:45], v[204:205], [%[c16], %[c17], %[c18], %[c19]
]
\n
"
_UK_MFMA_
"
v[144:147], acc[46:47], v[206:207], v[144:147
]
\n
"
_UK_MFMA_
"
[%[c16], %[c17], %[c18], %[c19]], acc[46:47], v[206:207], [%[c16], %[c17], %[c18], %[c19]
]
\n
"
" ds_read_b128 v[228:231], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_1]
\n
"
" ds_read_b128 v[228:231], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_1]
\n
"
_UK_MFMA_
"
v[148:151], acc[32:33], v[208:209], v[148:151
]
\n
"
_UK_MFMA_
"
[%[c20], %[c21], %[c22], %[c23]], acc[32:33], v[208:209], [%[c20], %[c21], %[c22], %[c23]
]
\n
"
_UK_MFMA_
"
v[148:151], acc[34:35], v[210:211], v[148:151
]
\n
"
_UK_MFMA_
"
[%[c20], %[c21], %[c22], %[c23]], acc[34:35], v[210:211], [%[c20], %[c21], %[c22], %[c23]
]
\n
"
" buffer_load_dwordx4 acc[168:171], %[v_os_b2], s[24:27], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[168:171], %[v_os_b2], s[24:27], 0 offen offset:2048
\n
"
_UK_MFMA_
"
v[148:151], acc[36:37], v[212:213], v[148:151
]
\n
"
_UK_MFMA_
"
[%[c20], %[c21], %[c22], %[c23]], acc[36:37], v[212:213], [%[c20], %[c21], %[c22], %[c23]
]
\n
"
_UK_MFMA_
"
v[148:151], acc[38:39], v[214:215], v[148:151
]
\n
"
_UK_MFMA_
"
[%[c20], %[c21], %[c22], %[c23]], acc[38:39], v[214:215], [%[c20], %[c21], %[c22], %[c23]
]
\n
"
" ds_read_b128 v[232:235], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_2]
\n
"
" ds_read_b128 v[232:235], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_2]
\n
"
_UK_MFMA_
"
v[148:151], acc[40:41], v[216:217], v[148:151
]
\n
"
_UK_MFMA_
"
[%[c20], %[c21], %[c22], %[c23]], acc[40:41], v[216:217], [%[c20], %[c21], %[c22], %[c23]
]
\n
"
_UK_MFMA_
"
v[148:151], acc[42:43], v[218:219], v[148:151
]
\n
"
_UK_MFMA_
"
[%[c20], %[c21], %[c22], %[c23]], acc[42:43], v[218:219], [%[c20], %[c21], %[c22], %[c23]
]
\n
"
" buffer_load_dwordx4 acc[172:175], %[v_os_b2], s[24:27], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[172:175], %[v_os_b2], s[24:27], 0 offen offset:3072
\n
"
_UK_MFMA_
"
v[148:151], acc[44:45], v[220:221], v[148:151
]
\n
"
_UK_MFMA_
"
[%[c20], %[c21], %[c22], %[c23]], acc[44:45], v[220:221], [%[c20], %[c21], %[c22], %[c23]
]
\n
"
_UK_MFMA_
"
v[148:151], acc[46:47], v[222:223], v[148:151
]
\n
"
_UK_MFMA_
"
[%[c20], %[c21], %[c22], %[c23]], acc[46:47], v[222:223], [%[c20], %[c21], %[c22], %[c23]
]
\n
"
" ds_read_b128 v[236:239], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_3]
\n
"
" ds_read_b128 v[236:239], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_3]
\n
"
_UK_MFMA_
"
v[152:155], acc[48:49], v[192:193], v[152:155
]
\n
"
_UK_MFMA_
"
[%[c24], %[c25], %[c26], %[c27]], acc[48:49], v[192:193], [%[c24], %[c25], %[c26], %[c27]
]
\n
"
_UK_MFMA_
"
v[152:155], acc[50:51], v[194:195], v[152:155
]
\n
"
_UK_MFMA_
"
[%[c24], %[c25], %[c26], %[c27]], acc[50:51], v[194:195], [%[c24], %[c25], %[c26], %[c27]
]
\n
"
" buffer_load_dwordx4 acc[176:179], %[v_os_b3], s[24:27], 0 offen
\n
"
" buffer_load_dwordx4 acc[176:179], %[v_os_b3], s[24:27], 0 offen
\n
"
_UK_MFMA_
"
v[152:155], acc[52:53], v[196:197], v[152:155
]
\n
"
_UK_MFMA_
"
[%[c24], %[c25], %[c26], %[c27]], acc[52:53], v[196:197], [%[c24], %[c25], %[c26], %[c27]
]
\n
"
_UK_MFMA_
"
v[152:155], acc[54:55], v[198:199], v[152:155
]
\n
"
_UK_MFMA_
"
[%[c24], %[c25], %[c26], %[c27]], acc[54:55], v[198:199], [%[c24], %[c25], %[c26], %[c27]
]
\n
"
" ds_read_b128 v[240:243], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_4]
\n
"
" ds_read_b128 v[240:243], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_4]
\n
"
_UK_MFMA_
"
v[152:155], acc[56:57], v[200:201], v[152:155
]
\n
"
_UK_MFMA_
"
[%[c24], %[c25], %[c26], %[c27]], acc[56:57], v[200:201], [%[c24], %[c25], %[c26], %[c27]
]
\n
"
_UK_MFMA_
"
v[152:155], acc[58:59], v[202:203], v[152:155
]
\n
"
_UK_MFMA_
"
[%[c24], %[c25], %[c26], %[c27]], acc[58:59], v[202:203], [%[c24], %[c25], %[c26], %[c27]
]
\n
"
" buffer_load_dwordx4 acc[180:183], %[v_os_b3], s[24:27], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[180:183], %[v_os_b3], s[24:27], 0 offen offset:1024
\n
"
_UK_MFMA_
"
v[152:155], acc[60:61], v[204:205], v[152:155
]
\n
"
_UK_MFMA_
"
[%[c24], %[c25], %[c26], %[c27]], acc[60:61], v[204:205], [%[c24], %[c25], %[c26], %[c27]
]
\n
"
_UK_MFMA_
"
v[152:155], acc[62:63], v[206:207], v[152:155
]
\n
"
_UK_MFMA_
"
[%[c24], %[c25], %[c26], %[c27]], acc[62:63], v[206:207], [%[c24], %[c25], %[c26], %[c27]
]
\n
"
" ds_read_b128 v[244:247], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_5]
\n
"
" ds_read_b128 v[244:247], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_5]
\n
"
_UK_MFMA_
"
v[156:159], acc[48:49], v[208:209], v[156:159
]
\n
"
_UK_MFMA_
"
[%[c28], %[c29], %[c30], %[c31]], acc[48:49], v[208:209], [%[c28], %[c29], %[c30], %[c31]
]
\n
"
_UK_MFMA_
"
v[156:159], acc[50:51], v[210:211], v[156:159
]
\n
"
_UK_MFMA_
"
[%[c28], %[c29], %[c30], %[c31]], acc[50:51], v[210:211], [%[c28], %[c29], %[c30], %[c31]
]
\n
"
" buffer_load_dwordx4 acc[184:187], %[v_os_b3], s[24:27], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[184:187], %[v_os_b3], s[24:27], 0 offen offset:2048
\n
"
_UK_MFMA_
"
v[156:159], acc[52:53], v[212:213], v[156:159
]
\n
"
_UK_MFMA_
"
[%[c28], %[c29], %[c30], %[c31]], acc[52:53], v[212:213], [%[c28], %[c29], %[c30], %[c31]
]
\n
"
_UK_MFMA_
"
v[156:159], acc[54:55], v[214:215], v[156:159
]
\n
"
_UK_MFMA_
"
[%[c28], %[c29], %[c30], %[c31]], acc[54:55], v[214:215], [%[c28], %[c29], %[c30], %[c31]
]
\n
"
" ds_read_b128 v[248:251], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_6]
\n
"
" ds_read_b128 v[248:251], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_6]
\n
"
_UK_MFMA_
"
v[156:159], acc[56:57], v[216:217], v[156:159
]
\n
"
_UK_MFMA_
"
[%[c28], %[c29], %[c30], %[c31]], acc[56:57], v[216:217], [%[c28], %[c29], %[c30], %[c31]
]
\n
"
_UK_MFMA_
"
v[156:159], acc[58:59], v[218:219], v[156:159
]
\n
"
_UK_MFMA_
"
[%[c28], %[c29], %[c30], %[c31]], acc[58:59], v[218:219], [%[c28], %[c29], %[c30], %[c31]
]
\n
"
" buffer_load_dwordx4 acc[188:191], %[v_os_b3], s[24:27], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[188:191], %[v_os_b3], s[24:27], 0 offen offset:3072
\n
"
_UK_MFMA_
"
v[156:159], acc[60:61], v[220:221], v[156:159
]
\n
"
_UK_MFMA_
"
[%[c28], %[c29], %[c30], %[c31]], acc[60:61], v[220:221], [%[c28], %[c29], %[c30], %[c31]
]
\n
"
_UK_MFMA_
"
v[156:159], acc[62:63], v[222:223], v[156:159
]
\n
"
_UK_MFMA_
"
[%[c28], %[c29], %[c30], %[c31]], acc[62:63], v[222:223], [%[c28], %[c29], %[c30], %[c31]
]
\n
"
" ds_read_b128 v[252:255], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_7]
\n
"
" ds_read_b128 v[252:255], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_7]
\n
"
" s_waitcnt vmcnt(32)
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
"
v[160:163], acc[64:65], v[192:193], v[160:163
]
\n
"
_UK_MFMA_
"
[%[c32], %[c33], %[c34], %[c35]], acc[64:65], v[192:193], [%[c32], %[c33], %[c34], %[c35]
]
\n
"
_UK_MFMA_
"
v[160:163], acc[66:67], v[194:195], v[160:163
]
\n
"
_UK_MFMA_
"
[%[c32], %[c33], %[c34], %[c35]], acc[66:67], v[194:195], [%[c32], %[c33], %[c34], %[c35]
]
\n
"
" buffer_load_dwordx4 acc[192:195], %[v_os_b4], s[24:27], 0 offen
\n
"
" buffer_load_dwordx4 acc[192:195], %[v_os_b4], s[24:27], 0 offen
\n
"
_UK_MFMA_
"
v[160:163], acc[68:69], v[196:197], v[160:163
]
\n
"
_UK_MFMA_
"
[%[c32], %[c33], %[c34], %[c35]], acc[68:69], v[196:197], [%[c32], %[c33], %[c34], %[c35]
]
\n
"
_UK_MFMA_
"
v[160:163], acc[70:71], v[198:199], v[160:163
]
\n
"
_UK_MFMA_
"
[%[c32], %[c33], %[c34], %[c35]], acc[70:71], v[198:199], [%[c32], %[c33], %[c34], %[c35]
]
\n
"
_UK_MFMA_
"
v[160:163], acc[72:73], v[200:201], v[160:163
]
\n
"
_UK_MFMA_
"
[%[c32], %[c33], %[c34], %[c35]], acc[72:73], v[200:201], [%[c32], %[c33], %[c34], %[c35]
]
\n
"
_UK_MFMA_
"
v[160:163], acc[74:75], v[202:203], v[160:163
]
\n
"
_UK_MFMA_
"
[%[c32], %[c33], %[c34], %[c35]], acc[74:75], v[202:203], [%[c32], %[c33], %[c34], %[c35]
]
\n
"
" buffer_load_dwordx4 acc[196:199], %[v_os_b4], s[24:27], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[196:199], %[v_os_b4], s[24:27], 0 offen offset:1024
\n
"
_UK_MFMA_
"
v[160:163], acc[76:77], v[204:205], v[160:163
]
\n
"
_UK_MFMA_
"
[%[c32], %[c33], %[c34], %[c35]], acc[76:77], v[204:205], [%[c32], %[c33], %[c34], %[c35]
]
\n
"
_UK_MFMA_
"
v[160:163], acc[78:79], v[206:207], v[160:163
]
\n
"
_UK_MFMA_
"
[%[c32], %[c33], %[c34], %[c35]], acc[78:79], v[206:207], [%[c32], %[c33], %[c34], %[c35]
]
\n
"
_UK_MFMA_
"
v[164:167], acc[64:65], v[208:209], v[164:167
]
\n
"
_UK_MFMA_
"
[%[c36], %[c37], %[c38], %[c39]], acc[64:65], v[208:209], [%[c36], %[c37], %[c38], %[c39]
]
\n
"
_UK_MFMA_
"
v[164:167], acc[66:67], v[210:211], v[164:167
]
\n
"
_UK_MFMA_
"
[%[c36], %[c37], %[c38], %[c39]], acc[66:67], v[210:211], [%[c36], %[c37], %[c38], %[c39]
]
\n
"
" buffer_load_dwordx4 acc[200:203], %[v_os_b4], s[24:27], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[200:203], %[v_os_b4], s[24:27], 0 offen offset:2048
\n
"
_UK_MFMA_
"
v[164:167], acc[68:69], v[212:213], v[164:167
]
\n
"
_UK_MFMA_
"
[%[c36], %[c37], %[c38], %[c39]], acc[68:69], v[212:213], [%[c36], %[c37], %[c38], %[c39]
]
\n
"
_UK_MFMA_
"
v[164:167], acc[70:71], v[214:215], v[164:167
]
\n
"
_UK_MFMA_
"
[%[c36], %[c37], %[c38], %[c39]], acc[70:71], v[214:215], [%[c36], %[c37], %[c38], %[c39]
]
\n
"
_UK_MFMA_
"
v[164:167], acc[72:73], v[216:217], v[164:167
]
\n
"
_UK_MFMA_
"
[%[c36], %[c37], %[c38], %[c39]], acc[72:73], v[216:217], [%[c36], %[c37], %[c38], %[c39]
]
\n
"
_UK_MFMA_
"
v[164:167], acc[74:75], v[218:219], v[164:167
]
\n
"
_UK_MFMA_
"
[%[c36], %[c37], %[c38], %[c39]], acc[74:75], v[218:219], [%[c36], %[c37], %[c38], %[c39]
]
\n
"
" buffer_load_dwordx4 acc[204:207], %[v_os_b4], s[24:27], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[204:207], %[v_os_b4], s[24:27], 0 offen offset:3072
\n
"
_UK_MFMA_
"
v[164:167], acc[76:77], v[220:221], v[164:167
]
\n
"
_UK_MFMA_
"
[%[c36], %[c37], %[c38], %[c39]], acc[76:77], v[220:221], [%[c36], %[c37], %[c38], %[c39]
]
\n
"
_UK_MFMA_
"
v[164:167], acc[78:79], v[222:223], v[164:167
]
\n
"
_UK_MFMA_
"
[%[c36], %[c37], %[c38], %[c39]], acc[78:79], v[222:223], [%[c36], %[c37], %[c38], %[c39]
]
\n
"
_UK_MFMA_
"
v[168:171], acc[80:81], v[192:193], v[168:171
]
\n
"
_UK_MFMA_
"
[%[c40], %[c41], %[c42], %[c43]], acc[80:81], v[192:193], [%[c40], %[c41], %[c42], %[c43]
]
\n
"
_UK_MFMA_
"
v[168:171], acc[82:83], v[194:195], v[168:171
]
\n
"
_UK_MFMA_
"
[%[c40], %[c41], %[c42], %[c43]], acc[82:83], v[194:195], [%[c40], %[c41], %[c42], %[c43]
]
\n
"
" buffer_load_dwordx4 acc[208:211], %[v_os_b5], s[24:27], 0 offen
\n
"
" buffer_load_dwordx4 acc[208:211], %[v_os_b5], s[24:27], 0 offen
\n
"
_UK_MFMA_
"
v[168:171], acc[84:85], v[196:197], v[168:171
]
\n
"
_UK_MFMA_
"
[%[c40], %[c41], %[c42], %[c43]], acc[84:85], v[196:197], [%[c40], %[c41], %[c42], %[c43]
]
\n
"
_UK_MFMA_
"
v[168:171], acc[86:87], v[198:199], v[168:171
]
\n
"
_UK_MFMA_
"
[%[c40], %[c41], %[c42], %[c43]], acc[86:87], v[198:199], [%[c40], %[c41], %[c42], %[c43]
]
\n
"
_UK_MFMA_
"
v[168:171], acc[88:89], v[200:201], v[168:171
]
\n
"
_UK_MFMA_
"
[%[c40], %[c41], %[c42], %[c43]], acc[88:89], v[200:201], [%[c40], %[c41], %[c42], %[c43]
]
\n
"
_UK_MFMA_
"
v[168:171], acc[90:91], v[202:203], v[168:171
]
\n
"
_UK_MFMA_
"
[%[c40], %[c41], %[c42], %[c43]], acc[90:91], v[202:203], [%[c40], %[c41], %[c42], %[c43]
]
\n
"
" buffer_load_dwordx4 acc[212:215], %[v_os_b5], s[24:27], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[212:215], %[v_os_b5], s[24:27], 0 offen offset:1024
\n
"
_UK_MFMA_
"
v[168:171], acc[92:93], v[204:205], v[168:171
]
\n
"
_UK_MFMA_
"
[%[c40], %[c41], %[c42], %[c43]], acc[92:93], v[204:205], [%[c40], %[c41], %[c42], %[c43]
]
\n
"
_UK_MFMA_
"
v[168:171], acc[94:95], v[206:207], v[168:171
]
\n
"
_UK_MFMA_
"
[%[c40], %[c41], %[c42], %[c43]], acc[94:95], v[206:207], [%[c40], %[c41], %[c42], %[c43]
]
\n
"
_UK_MFMA_
"
v[172:175], acc[80:81], v[208:209], v[172:175
]
\n
"
_UK_MFMA_
"
[%[c44], %[c45], %[c46], %[c47]], acc[80:81], v[208:209], [%[c44], %[c45], %[c46], %[c47]
]
\n
"
_UK_MFMA_
"
v[172:175], acc[82:83], v[210:211], v[172:175
]
\n
"
_UK_MFMA_
"
[%[c44], %[c45], %[c46], %[c47]], acc[82:83], v[210:211], [%[c44], %[c45], %[c46], %[c47]
]
\n
"
" buffer_load_dwordx4 acc[216:219], %[v_os_b5], s[24:27], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[216:219], %[v_os_b5], s[24:27], 0 offen offset:2048
\n
"
_UK_MFMA_
"
v[172:175], acc[84:85], v[212:213], v[172:175
]
\n
"
_UK_MFMA_
"
[%[c44], %[c45], %[c46], %[c47]], acc[84:85], v[212:213], [%[c44], %[c45], %[c46], %[c47]
]
\n
"
_UK_MFMA_
"
v[172:175], acc[86:87], v[214:215], v[172:175
]
\n
"
_UK_MFMA_
"
[%[c44], %[c45], %[c46], %[c47]], acc[86:87], v[214:215], [%[c44], %[c45], %[c46], %[c47]
]
\n
"
_UK_MFMA_
"
v[172:175], acc[88:89], v[216:217], v[172:175
]
\n
"
_UK_MFMA_
"
[%[c44], %[c45], %[c46], %[c47]], acc[88:89], v[216:217], [%[c44], %[c45], %[c46], %[c47]
]
\n
"
_UK_MFMA_
"
v[172:175], acc[90:91], v[218:219], v[172:175
]
\n
"
_UK_MFMA_
"
[%[c44], %[c45], %[c46], %[c47]], acc[90:91], v[218:219], [%[c44], %[c45], %[c46], %[c47]
]
\n
"
" buffer_load_dwordx4 acc[220:223], %[v_os_b5], s[24:27], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[220:223], %[v_os_b5], s[24:27], 0 offen offset:3072
\n
"
_UK_MFMA_
"
v[172:175], acc[92:93], v[220:221], v[172:175
]
\n
"
_UK_MFMA_
"
[%[c44], %[c45], %[c46], %[c47]], acc[92:93], v[220:221], [%[c44], %[c45], %[c46], %[c47]
]
\n
"
_UK_MFMA_
"
v[172:175], acc[94:95], v[222:223], v[172:175
]
\n
"
_UK_MFMA_
"
[%[c44], %[c45], %[c46], %[c47]], acc[94:95], v[222:223], [%[c44], %[c45], %[c46], %[c47]
]
\n
"
" s_waitcnt vmcnt(32)
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
"
v[176:179], acc[96:97], v[192:193], v[176:179
]
\n
"
_UK_MFMA_
"
[%[c48], %[c49], %[c50], %[c51]], acc[96:97], v[192:193], [%[c48], %[c49], %[c50], %[c51]
]
\n
"
_UK_MFMA_
"
v[176:179], acc[98:99], v[194:195], v[176:179
]
\n
"
_UK_MFMA_
"
[%[c48], %[c49], %[c50], %[c51]], acc[98:99], v[194:195], [%[c48], %[c49], %[c50], %[c51]
]
\n
"
" buffer_load_dwordx4 acc[224:227], %[v_os_b6], s[24:27], 0 offen
\n
"
" buffer_load_dwordx4 acc[224:227], %[v_os_b6], s[24:27], 0 offen
\n
"
_UK_MFMA_
"
v[176:179], acc[100:101], v[196:197], v[176:179
]
\n
"
_UK_MFMA_
"
[%[c48], %[c49], %[c50], %[c51]], acc[100:101], v[196:197], [%[c48], %[c49], %[c50], %[c51]
]
\n
"
_UK_MFMA_
"
v[176:179], acc[102:103], v[198:199], v[176:179
]
\n
"
_UK_MFMA_
"
[%[c48], %[c49], %[c50], %[c51]], acc[102:103], v[198:199], [%[c48], %[c49], %[c50], %[c51]
]
\n
"
_UK_MFMA_
"
v[176:179], acc[104:105], v[200:201], v[176:179
]
\n
"
_UK_MFMA_
"
[%[c48], %[c49], %[c50], %[c51]], acc[104:105], v[200:201], [%[c48], %[c49], %[c50], %[c51]
]
\n
"
_UK_MFMA_
"
v[176:179], acc[106:107], v[202:203], v[176:179
]
\n
"
_UK_MFMA_
"
[%[c48], %[c49], %[c50], %[c51]], acc[106:107], v[202:203], [%[c48], %[c49], %[c50], %[c51]
]
\n
"
" buffer_load_dwordx4 acc[228:231], %[v_os_b6], s[24:27], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[228:231], %[v_os_b6], s[24:27], 0 offen offset:1024
\n
"
_UK_MFMA_
"
v[176:179], acc[108:109], v[204:205], v[176:179
]
\n
"
_UK_MFMA_
"
[%[c48], %[c49], %[c50], %[c51]], acc[108:109], v[204:205], [%[c48], %[c49], %[c50], %[c51]
]
\n
"
_UK_MFMA_
"
v[176:179], acc[110:111], v[206:207], v[176:179
]
\n
"
_UK_MFMA_
"
[%[c48], %[c49], %[c50], %[c51]], acc[110:111], v[206:207], [%[c48], %[c49], %[c50], %[c51]
]
\n
"
_UK_MFMA_
"
v[180:183], acc[96:97], v[208:209], v[180:183
]
\n
"
_UK_MFMA_
"
[%[c52], %[c53], %[c54], %[c55]], acc[96:97], v[208:209], [%[c52], %[c53], %[c54], %[c55]
]
\n
"
_UK_MFMA_
"
v[180:183], acc[98:99], v[210:211], v[180:183
]
\n
"
_UK_MFMA_
"
[%[c52], %[c53], %[c54], %[c55]], acc[98:99], v[210:211], [%[c52], %[c53], %[c54], %[c55]
]
\n
"
" buffer_load_dwordx4 acc[232:235], %[v_os_b6], s[24:27], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[232:235], %[v_os_b6], s[24:27], 0 offen offset:2048
\n
"
_UK_MFMA_
"
v[180:183], acc[100:101], v[212:213], v[180:183
]
\n
"
_UK_MFMA_
"
[%[c52], %[c53], %[c54], %[c55]], acc[100:101], v[212:213], [%[c52], %[c53], %[c54], %[c55]
]
\n
"
_UK_MFMA_
"
v[180:183], acc[102:103], v[214:215], v[180:183
]
\n
"
_UK_MFMA_
"
[%[c52], %[c53], %[c54], %[c55]], acc[102:103], v[214:215], [%[c52], %[c53], %[c54], %[c55]
]
\n
"
_UK_MFMA_
"
v[180:183], acc[104:105], v[216:217], v[180:183
]
\n
"
_UK_MFMA_
"
[%[c52], %[c53], %[c54], %[c55]], acc[104:105], v[216:217], [%[c52], %[c53], %[c54], %[c55]
]
\n
"
_UK_MFMA_
"
v[180:183], acc[106:107], v[218:219], v[180:183
]
\n
"
_UK_MFMA_
"
[%[c52], %[c53], %[c54], %[c55]], acc[106:107], v[218:219], [%[c52], %[c53], %[c54], %[c55]
]
\n
"
" buffer_load_dwordx4 acc[236:239], %[v_os_b6], s[24:27], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[236:239], %[v_os_b6], s[24:27], 0 offen offset:3072
\n
"
_UK_MFMA_
"
v[180:183], acc[108:109], v[220:221], v[180:183
]
\n
"
_UK_MFMA_
"
[%[c52], %[c53], %[c54], %[c55]], acc[108:109], v[220:221], [%[c52], %[c53], %[c54], %[c55]
]
\n
"
_UK_MFMA_
"
v[180:183], acc[110:111], v[222:223], v[180:183
]
\n
"
_UK_MFMA_
"
[%[c52], %[c53], %[c54], %[c55]], acc[110:111], v[222:223], [%[c52], %[c53], %[c54], %[c55]
]
\n
"
_UK_MFMA_
"
v[184:187], acc[112:113], v[192:193], v[184:187
]
\n
"
_UK_MFMA_
"
[%[c56], %[c57], %[c58], %[c59]], acc[112:113], v[192:193], [%[c56], %[c57], %[c58], %[c59]
]
\n
"
_UK_MFMA_
"
v[184:187], acc[114:115], v[194:195], v[184:187
]
\n
"
_UK_MFMA_
"
[%[c56], %[c57], %[c58], %[c59]], acc[114:115], v[194:195], [%[c56], %[c57], %[c58], %[c59]
]
\n
"
" buffer_load_dwordx4 acc[240:243], %[v_os_b7], s[24:27], 0 offen
\n
"
" buffer_load_dwordx4 acc[240:243], %[v_os_b7], s[24:27], 0 offen
\n
"
_UK_MFMA_
"
v[184:187], acc[116:117], v[196:197], v[184:187
]
\n
"
_UK_MFMA_
"
[%[c56], %[c57], %[c58], %[c59]], acc[116:117], v[196:197], [%[c56], %[c57], %[c58], %[c59]
]
\n
"
_UK_MFMA_
"
v[184:187], acc[118:119], v[198:199], v[184:187
]
\n
"
_UK_MFMA_
"
[%[c56], %[c57], %[c58], %[c59]], acc[118:119], v[198:199], [%[c56], %[c57], %[c58], %[c59]
]
\n
"
_UK_MFMA_
"
v[184:187], acc[120:121], v[200:201], v[184:187
]
\n
"
_UK_MFMA_
"
[%[c56], %[c57], %[c58], %[c59]], acc[120:121], v[200:201], [%[c56], %[c57], %[c58], %[c59]
]
\n
"
_UK_MFMA_
"
v[184:187], acc[122:123], v[202:203], v[184:187
]
\n
"
_UK_MFMA_
"
[%[c56], %[c57], %[c58], %[c59]], acc[122:123], v[202:203], [%[c56], %[c57], %[c58], %[c59]
]
\n
"
" buffer_load_dwordx4 acc[244:247], %[v_os_b7], s[24:27], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[244:247], %[v_os_b7], s[24:27], 0 offen offset:1024
\n
"
_UK_MFMA_
"
v[184:187], acc[124:125], v[204:205], v[184:187
]
\n
"
_UK_MFMA_
"
[%[c56], %[c57], %[c58], %[c59]], acc[124:125], v[204:205], [%[c56], %[c57], %[c58], %[c59]
]
\n
"
_UK_MFMA_
"
v[184:187], acc[126:127], v[206:207], v[184:187
]
\n
"
_UK_MFMA_
"
[%[c56], %[c57], %[c58], %[c59]], acc[126:127], v[206:207], [%[c56], %[c57], %[c58], %[c59]
]
\n
"
_UK_MFMA_
"
v[188:191], acc[112:113], v[208:209], v[188:191
]
\n
"
_UK_MFMA_
"
[%[c60], %[c61], %[c62], %[c63]], acc[112:113], v[208:209], [%[c60], %[c61], %[c62], %[c63]
]
\n
"
_UK_MFMA_
"
v[188:191], acc[114:115], v[210:211], v[188:191
]
\n
"
_UK_MFMA_
"
[%[c60], %[c61], %[c62], %[c63]], acc[114:115], v[210:211], [%[c60], %[c61], %[c62], %[c63]
]
\n
"
" buffer_load_dwordx4 acc[248:251], %[v_os_b7], s[24:27], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[248:251], %[v_os_b7], s[24:27], 0 offen offset:2048
\n
"
_UK_MFMA_
"
v[188:191], acc[116:117], v[212:213], v[188:191
]
\n
"
_UK_MFMA_
"
[%[c60], %[c61], %[c62], %[c63]], acc[116:117], v[212:213], [%[c60], %[c61], %[c62], %[c63]
]
\n
"
_UK_MFMA_
"
v[188:191], acc[118:119], v[214:215], v[188:191
]
\n
"
_UK_MFMA_
"
[%[c60], %[c61], %[c62], %[c63]], acc[118:119], v[214:215], [%[c60], %[c61], %[c62], %[c63]
]
\n
"
_UK_MFMA_
"
v[188:191], acc[120:121], v[216:217], v[188:191
]
\n
"
_UK_MFMA_
"
[%[c60], %[c61], %[c62], %[c63]], acc[120:121], v[216:217], [%[c60], %[c61], %[c62], %[c63]
]
\n
"
_UK_MFMA_
"
v[188:191], acc[122:123], v[218:219], v[188:191
]
\n
"
_UK_MFMA_
"
[%[c60], %[c61], %[c62], %[c63]], acc[122:123], v[218:219], [%[c60], %[c61], %[c62], %[c63]
]
\n
"
" buffer_load_dwordx4 acc[252:255], %[v_os_b7], s[24:27], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[252:255], %[v_os_b7], s[24:27], 0 offen offset:3072
\n
"
_UK_MFMA_
"
v[188:191], acc[124:125], v[220:221], v[188:191
]
\n
"
_UK_MFMA_
"
[%[c60], %[c61], %[c62], %[c63]], acc[124:125], v[220:221], [%[c60], %[c61], %[c62], %[c63]
]
\n
"
_UK_MFMA_
"
v[188:191], acc[126:127], v[222:223], v[188:191
]
\n
"
_UK_MFMA_
"
[%[c60], %[c61], %[c62], %[c63]], acc[126:127], v[222:223], [%[c60], %[c61], %[c62], %[c63]
]
\n
"
" s_add_u32 s60, 0x00000300, s80
\n
"
" s_add_u32 s60, 0x00000300, s80
\n
"
" s_cmp_lt_u32 s60, %[s_loop_cnt]
\n
"
" s_cmp_lt_u32 s60, %[s_loop_cnt]
\n
"
" s_cselect_b32 s57, s57, 0
\n
"
" s_cselect_b32 s57, s57, 0
\n
"
...
@@ -391,193 +391,193 @@
...
@@ -391,193 +391,193 @@
" s_cbranch_scc0 label_end
\n
"
" s_cbranch_scc0 label_end
\n
"
" s_waitcnt vmcnt(24) & lgkmcnt(0)
\n
"
" s_waitcnt vmcnt(24) & lgkmcnt(0)
\n
"
" s_barrier
\n
"
" s_barrier
\n
"
_UK_MFMA_
"
v[128:131
], acc[128:129], v[224:225],
v[128:131
]
\n
"
_UK_MFMA_
"
[%[c0], %[c1], %[c2], %[c3]
], acc[128:129], v[224:225],
[%[c0], %[c1], %[c2], %[c3]
]
\n
"
_UK_MFMA_
"
v[128:131
], acc[130:131], v[226:227],
v[128:131
]
\n
"
_UK_MFMA_
"
[%[c0], %[c1], %[c2], %[c3]
], acc[130:131], v[226:227],
[%[c0], %[c1], %[c2], %[c3]
]
\n
"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[24:27], 0 offen
\n
"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[24:27], 0 offen
\n
"
_UK_MFMA_
"
v[128:131
], acc[132:133], v[228:229],
v[128:131
]
\n
"
_UK_MFMA_
"
[%[c0], %[c1], %[c2], %[c3]
], acc[132:133], v[228:229],
[%[c0], %[c1], %[c2], %[c3]
]
\n
"
_UK_MFMA_
"
v[128:131
], acc[134:135], v[230:231],
v[128:131
]
\n
"
_UK_MFMA_
"
[%[c0], %[c1], %[c2], %[c3]
], acc[134:135], v[230:231],
[%[c0], %[c1], %[c2], %[c3]
]
\n
"
" buffer_load_dword s[20:23], 0 offen lds
\n
"
" buffer_load_dword s[20:23], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
"
v[128:131
], acc[136:137], v[232:233],
v[128:131
]
\n
"
_UK_MFMA_
"
[%[c0], %[c1], %[c2], %[c3]
], acc[136:137], v[232:233],
[%[c0], %[c1], %[c2], %[c3]
]
\n
"
_UK_MFMA_
"
v[128:131
], acc[138:139], v[234:235],
v[128:131
]
\n
"
_UK_MFMA_
"
[%[c0], %[c1], %[c2], %[c3]
], acc[138:139], v[234:235],
[%[c0], %[c1], %[c2], %[c3]
]
\n
"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[24:27], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[24:27], 0 offen offset:1024
\n
"
_UK_MFMA_
"
v[128:131
], acc[140:141], v[236:237],
v[128:131
]
\n
"
_UK_MFMA_
"
[%[c0], %[c1], %[c2], %[c3]
], acc[140:141], v[236:237],
[%[c0], %[c1], %[c2], %[c3]
]
\n
"
_UK_MFMA_
"
v[128:131
], acc[142:143], v[238:239],
v[128:131
]
\n
"
_UK_MFMA_
"
[%[c0], %[c1], %[c2], %[c3]
], acc[142:143], v[238:239],
[%[c0], %[c1], %[c2], %[c3]
]
\n
"
" buffer_load_dword s[20:23], 0 offen lds
\n
"
" buffer_load_dword s[20:23], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
"
v[132:135
], acc[128:129], v[240:241],
v[132:135
]
\n
"
_UK_MFMA_
"
[%[c4], %[c5], %[c6], %[c7]
], acc[128:129], v[240:241],
[%[c4], %[c5], %[c6], %[c7]
]
\n
"
_UK_MFMA_
"
v[132:135
], acc[130:131], v[242:243],
v[132:135
]
\n
"
_UK_MFMA_
"
[%[c4], %[c5], %[c6], %[c7]
], acc[130:131], v[242:243],
[%[c4], %[c5], %[c6], %[c7]
]
\n
"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[24:27], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[24:27], 0 offen offset:2048
\n
"
_UK_MFMA_
"
v[132:135
], acc[132:133], v[244:245],
v[132:135
]
\n
"
_UK_MFMA_
"
[%[c4], %[c5], %[c6], %[c7]
], acc[132:133], v[244:245],
[%[c4], %[c5], %[c6], %[c7]
]
\n
"
_UK_MFMA_
"
v[132:135
], acc[134:135], v[246:247],
v[132:135
]
\n
"
_UK_MFMA_
"
[%[c4], %[c5], %[c6], %[c7]
], acc[134:135], v[246:247],
[%[c4], %[c5], %[c6], %[c7]
]
\n
"
" buffer_load_dword s[20:23], 0 offen lds
\n
"
" buffer_load_dword s[20:23], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
"
v[132:135
], acc[136:137], v[248:249],
v[132:135
]
\n
"
_UK_MFMA_
"
[%[c4], %[c5], %[c6], %[c7]
], acc[136:137], v[248:249],
[%[c4], %[c5], %[c6], %[c7]
]
\n
"
_UK_MFMA_
"
v[132:135
], acc[138:139], v[250:251],
v[132:135
]
\n
"
_UK_MFMA_
"
[%[c4], %[c5], %[c6], %[c7]
], acc[138:139], v[250:251],
[%[c4], %[c5], %[c6], %[c7]
]
\n
"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[24:27], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[24:27], 0 offen offset:3072
\n
"
_UK_MFMA_
"
v[132:135
], acc[140:141], v[252:253],
v[132:135
]
\n
"
_UK_MFMA_
"
[%[c4], %[c5], %[c6], %[c7]
], acc[140:141], v[252:253],
[%[c4], %[c5], %[c6], %[c7]
]
\n
"
_UK_MFMA_
"
v[132:135
], acc[142:143], v[254:255],
v[132:135
]
\n
"
_UK_MFMA_
"
[%[c4], %[c5], %[c6], %[c7]
], acc[142:143], v[254:255],
[%[c4], %[c5], %[c6], %[c7]
]
\n
"
" buffer_load_dword s[20:23], 0 offen lds
\n
"
" buffer_load_dword s[20:23], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
"
v[136:139], acc[144:145], v[224:225], v[136:139
]
\n
"
_UK_MFMA_
"
[%[c8], %[c9], %[c10], %[c11]], acc[144:145], v[224:225], [%[c8], %[c9], %[c10], %[c11]
]
\n
"
_UK_MFMA_
"
v[136:139], acc[146:147], v[226:227], v[136:139
]
\n
"
_UK_MFMA_
"
[%[c8], %[c9], %[c10], %[c11]], acc[146:147], v[226:227], [%[c8], %[c9], %[c10], %[c11]
]
\n
"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[24:27], 0 offen
\n
"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[24:27], 0 offen
\n
"
_UK_MFMA_
"
v[136:139], acc[148:149], v[228:229], v[136:139
]
\n
"
_UK_MFMA_
"
[%[c8], %[c9], %[c10], %[c11]], acc[148:149], v[228:229], [%[c8], %[c9], %[c10], %[c11]
]
\n
"
_UK_MFMA_
"
v[136:139], acc[150:151], v[230:231], v[136:139
]
\n
"
_UK_MFMA_
"
[%[c8], %[c9], %[c10], %[c11]], acc[150:151], v[230:231], [%[c8], %[c9], %[c10], %[c11]
]
\n
"
" buffer_load_dword s[20:23], 0 offen lds
\n
"
" buffer_load_dword s[20:23], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
"
v[136:139], acc[152:153], v[232:233], v[136:139
]
\n
"
_UK_MFMA_
"
[%[c8], %[c9], %[c10], %[c11]], acc[152:153], v[232:233], [%[c8], %[c9], %[c10], %[c11]
]
\n
"
_UK_MFMA_
"
v[136:139], acc[154:155], v[234:235], v[136:139
]
\n
"
_UK_MFMA_
"
[%[c8], %[c9], %[c10], %[c11]], acc[154:155], v[234:235], [%[c8], %[c9], %[c10], %[c11]
]
\n
"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[24:27], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[24:27], 0 offen offset:1024
\n
"
_UK_MFMA_
"
v[136:139], acc[156:157], v[236:237], v[136:139
]
\n
"
_UK_MFMA_
"
[%[c8], %[c9], %[c10], %[c11]], acc[156:157], v[236:237], [%[c8], %[c9], %[c10], %[c11]
]
\n
"
_UK_MFMA_
"
v[136:139], acc[158:159], v[238:239], v[136:139
]
\n
"
_UK_MFMA_
"
[%[c8], %[c9], %[c10], %[c11]], acc[158:159], v[238:239], [%[c8], %[c9], %[c10], %[c11]
]
\n
"
" buffer_load_dword s[20:23], 0 offen lds
\n
"
" buffer_load_dword s[20:23], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
"
v[140:143], acc[144:145], v[240:241], v[140:143
]
\n
"
_UK_MFMA_
"
[%[c12], %[c13], %[c14], %[c15]], acc[144:145], v[240:241], [%[c12], %[c13], %[c14], %[c15]
]
\n
"
_UK_MFMA_
"
v[140:143], acc[146:147], v[242:243], v[140:143
]
\n
"
_UK_MFMA_
"
[%[c12], %[c13], %[c14], %[c15]], acc[146:147], v[242:243], [%[c12], %[c13], %[c14], %[c15]
]
\n
"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[24:27], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[24:27], 0 offen offset:2048
\n
"
_UK_MFMA_
"
v[140:143], acc[148:149], v[244:245], v[140:143
]
\n
"
_UK_MFMA_
"
[%[c12], %[c13], %[c14], %[c15]], acc[148:149], v[244:245], [%[c12], %[c13], %[c14], %[c15]
]
\n
"
_UK_MFMA_
"
v[140:143], acc[150:151], v[246:247], v[140:143
]
\n
"
_UK_MFMA_
"
[%[c12], %[c13], %[c14], %[c15]], acc[150:151], v[246:247], [%[c12], %[c13], %[c14], %[c15]
]
\n
"
" buffer_load_dword s[20:23], 0 offen lds
\n
"
" buffer_load_dword s[20:23], 0 offen lds
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
" s_add_u32 m0, %[s_size_per_issue], m0
\n
"
_UK_MFMA_
"
v[140:143], acc[152:153], v[248:249], v[140:143
]
\n
"
_UK_MFMA_
"
[%[c12], %[c13], %[c14], %[c15]], acc[152:153], v[248:249], [%[c12], %[c13], %[c14], %[c15]
]
\n
"
_UK_MFMA_
"
v[140:143], acc[154:155], v[250:251], v[140:143
]
\n
"
_UK_MFMA_
"
[%[c12], %[c13], %[c14], %[c15]], acc[154:155], v[250:251], [%[c12], %[c13], %[c14], %[c15]
]
\n
"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[24:27], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[24:27], 0 offen offset:3072
\n
"
_UK_MFMA_
"
v[140:143], acc[156:157], v[252:253], v[140:143
]
\n
"
_UK_MFMA_
"
[%[c12], %[c13], %[c14], %[c15]], acc[156:157], v[252:253], [%[c12], %[c13], %[c14], %[c15]
]
\n
"
_UK_MFMA_
"
v[140:143], acc[158:159], v[254:255], v[140:143
]
\n
"
_UK_MFMA_
"
[%[c12], %[c13], %[c14], %[c15]], acc[158:159], v[254:255], [%[c12], %[c13], %[c14], %[c15]
]
\n
"
" buffer_load_dword s[20:23], 0 offen lds
\n
"
" buffer_load_dword s[20:23], 0 offen lds
\n
"
" s_add_u32 m0, 0, %[s_m0_init]
\n
"
" s_add_u32 m0, 0, %[s_m0_init]
\n
"
" s_waitcnt vmcnt(32)
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
"
v[144:147], acc[160:161], v[224:225], v[144:147
]
\n
"
_UK_MFMA_
"
[%[c16], %[c17], %[c18], %[c19]], acc[160:161], v[224:225], [%[c16], %[c17], %[c18], %[c19]
]
\n
"
_UK_MFMA_
"
v[144:147], acc[162:163], v[226:227], v[144:147
]
\n
"
_UK_MFMA_
"
[%[c16], %[c17], %[c18], %[c19]], acc[162:163], v[226:227], [%[c16], %[c17], %[c18], %[c19]
]
\n
"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[24:27], 0 offen
\n
"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[24:27], 0 offen
\n
"
_UK_MFMA_
"
v[144:147], acc[164:165], v[228:229], v[144:147
]
\n
"
_UK_MFMA_
"
[%[c16], %[c17], %[c18], %[c19]], acc[164:165], v[228:229], [%[c16], %[c17], %[c18], %[c19]
]
\n
"
_UK_MFMA_
"
v[144:147], acc[166:167], v[230:231], v[144:147
]
\n
"
_UK_MFMA_
"
[%[c16], %[c17], %[c18], %[c19]], acc[166:167], v[230:231], [%[c16], %[c17], %[c18], %[c19]
]
\n
"
" ds_read_b128 v[192:195], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_0]
\n
"
" ds_read_b128 v[192:195], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_0]
\n
"
_UK_MFMA_
"
v[144:147], acc[168:169], v[232:233], v[144:147
]
\n
"
_UK_MFMA_
"
[%[c16], %[c17], %[c18], %[c19]], acc[168:169], v[232:233], [%[c16], %[c17], %[c18], %[c19]
]
\n
"
_UK_MFMA_
"
v[144:147], acc[170:171], v[234:235], v[144:147
]
\n
"
_UK_MFMA_
"
[%[c16], %[c17], %[c18], %[c19]], acc[170:171], v[234:235], [%[c16], %[c17], %[c18], %[c19]
]
\n
"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[24:27], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[24:27], 0 offen offset:1024
\n
"
_UK_MFMA_
"
v[144:147], acc[172:173], v[236:237], v[144:147
]
\n
"
_UK_MFMA_
"
[%[c16], %[c17], %[c18], %[c19]], acc[172:173], v[236:237], [%[c16], %[c17], %[c18], %[c19]
]
\n
"
_UK_MFMA_
"
v[144:147], acc[174:175], v[238:239], v[144:147
]
\n
"
_UK_MFMA_
"
[%[c16], %[c17], %[c18], %[c19]], acc[174:175], v[238:239], [%[c16], %[c17], %[c18], %[c19]
]
\n
"
" ds_read_b128 v[196:199], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_1]
\n
"
" ds_read_b128 v[196:199], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_1]
\n
"
_UK_MFMA_
"
v[148:151], acc[160:161], v[240:241], v[148:151
]
\n
"
_UK_MFMA_
"
[%[c20], %[c21], %[c22], %[c23]], acc[160:161], v[240:241], [%[c20], %[c21], %[c22], %[c23]
]
\n
"
_UK_MFMA_
"
v[148:151], acc[162:163], v[242:243], v[148:151
]
\n
"
_UK_MFMA_
"
[%[c20], %[c21], %[c22], %[c23]], acc[162:163], v[242:243], [%[c20], %[c21], %[c22], %[c23]
]
\n
"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[24:27], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[24:27], 0 offen offset:2048
\n
"
_UK_MFMA_
"
v[148:151], acc[164:165], v[244:245], v[148:151
]
\n
"
_UK_MFMA_
"
[%[c20], %[c21], %[c22], %[c23]], acc[164:165], v[244:245], [%[c20], %[c21], %[c22], %[c23]
]
\n
"
_UK_MFMA_
"
v[148:151], acc[166:167], v[246:247], v[148:151
]
\n
"
_UK_MFMA_
"
[%[c20], %[c21], %[c22], %[c23]], acc[166:167], v[246:247], [%[c20], %[c21], %[c22], %[c23]
]
\n
"
" ds_read_b128 v[200:203], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_2]
\n
"
" ds_read_b128 v[200:203], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_2]
\n
"
_UK_MFMA_
"
v[148:151], acc[168:169], v[248:249], v[148:151
]
\n
"
_UK_MFMA_
"
[%[c20], %[c21], %[c22], %[c23]], acc[168:169], v[248:249], [%[c20], %[c21], %[c22], %[c23]
]
\n
"
_UK_MFMA_
"
v[148:151], acc[170:171], v[250:251], v[148:151
]
\n
"
_UK_MFMA_
"
[%[c20], %[c21], %[c22], %[c23]], acc[170:171], v[250:251], [%[c20], %[c21], %[c22], %[c23]
]
\n
"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[24:27], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[24:27], 0 offen offset:3072
\n
"
_UK_MFMA_
"
v[148:151], acc[172:173], v[252:253], v[148:151
]
\n
"
_UK_MFMA_
"
[%[c20], %[c21], %[c22], %[c23]], acc[172:173], v[252:253], [%[c20], %[c21], %[c22], %[c23]
]
\n
"
_UK_MFMA_
"
v[148:151], acc[174:175], v[254:255], v[148:151
]
\n
"
_UK_MFMA_
"
[%[c20], %[c21], %[c22], %[c23]], acc[174:175], v[254:255], [%[c20], %[c21], %[c22], %[c23]
]
\n
"
" ds_read_b128 v[204:207], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_3]
\n
"
" ds_read_b128 v[204:207], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_3]
\n
"
_UK_MFMA_
"
v[152:155], acc[176:177], v[224:225], v[152:155
]
\n
"
_UK_MFMA_
"
[%[c24], %[c25], %[c26], %[c27]], acc[176:177], v[224:225], [%[c24], %[c25], %[c26], %[c27]
]
\n
"
_UK_MFMA_
"
v[152:155], acc[178:179], v[226:227], v[152:155
]
\n
"
_UK_MFMA_
"
[%[c24], %[c25], %[c26], %[c27]], acc[178:179], v[226:227], [%[c24], %[c25], %[c26], %[c27]
]
\n
"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[24:27], 0 offen
\n
"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[24:27], 0 offen
\n
"
_UK_MFMA_
"
v[152:155], acc[180:181], v[228:229], v[152:155
]
\n
"
_UK_MFMA_
"
[%[c24], %[c25], %[c26], %[c27]], acc[180:181], v[228:229], [%[c24], %[c25], %[c26], %[c27]
]
\n
"
_UK_MFMA_
"
v[152:155], acc[182:183], v[230:231], v[152:155
]
\n
"
_UK_MFMA_
"
[%[c24], %[c25], %[c26], %[c27]], acc[182:183], v[230:231], [%[c24], %[c25], %[c26], %[c27]
]
\n
"
" ds_read_b128 v[208:211], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_4]
\n
"
" ds_read_b128 v[208:211], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_4]
\n
"
_UK_MFMA_
"
v[152:155], acc[184:185], v[232:233], v[152:155
]
\n
"
_UK_MFMA_
"
[%[c24], %[c25], %[c26], %[c27]], acc[184:185], v[232:233], [%[c24], %[c25], %[c26], %[c27]
]
\n
"
_UK_MFMA_
"
v[152:155], acc[186:187], v[234:235], v[152:155
]
\n
"
_UK_MFMA_
"
[%[c24], %[c25], %[c26], %[c27]], acc[186:187], v[234:235], [%[c24], %[c25], %[c26], %[c27]
]
\n
"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[24:27], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[24:27], 0 offen offset:1024
\n
"
_UK_MFMA_
"
v[152:155], acc[188:189], v[236:237], v[152:155
]
\n
"
_UK_MFMA_
"
[%[c24], %[c25], %[c26], %[c27]], acc[188:189], v[236:237], [%[c24], %[c25], %[c26], %[c27]
]
\n
"
_UK_MFMA_
"
v[152:155], acc[190:191], v[238:239], v[152:155
]
\n
"
_UK_MFMA_
"
[%[c24], %[c25], %[c26], %[c27]], acc[190:191], v[238:239], [%[c24], %[c25], %[c26], %[c27]
]
\n
"
" ds_read_b128 v[212:215], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_5]
\n
"
" ds_read_b128 v[212:215], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_5]
\n
"
_UK_MFMA_
"
v[156:159], acc[176:177], v[240:241], v[156:159
]
\n
"
_UK_MFMA_
"
[%[c28], %[c29], %[c30], %[c31]], acc[176:177], v[240:241], [%[c28], %[c29], %[c30], %[c31]
]
\n
"
_UK_MFMA_
"
v[156:159], acc[178:179], v[242:243], v[156:159
]
\n
"
_UK_MFMA_
"
[%[c28], %[c29], %[c30], %[c31]], acc[178:179], v[242:243], [%[c28], %[c29], %[c30], %[c31]
]
\n
"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[24:27], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[24:27], 0 offen offset:2048
\n
"
_UK_MFMA_
"
v[156:159], acc[180:181], v[244:245], v[156:159
]
\n
"
_UK_MFMA_
"
[%[c28], %[c29], %[c30], %[c31]], acc[180:181], v[244:245], [%[c28], %[c29], %[c30], %[c31]
]
\n
"
_UK_MFMA_
"
v[156:159], acc[182:183], v[246:247], v[156:159
]
\n
"
_UK_MFMA_
"
[%[c28], %[c29], %[c30], %[c31]], acc[182:183], v[246:247], [%[c28], %[c29], %[c30], %[c31]
]
\n
"
" ds_read_b128 v[216:219], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_6]
\n
"
" ds_read_b128 v[216:219], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_6]
\n
"
_UK_MFMA_
"
v[156:159], acc[184:185], v[248:249], v[156:159
]
\n
"
_UK_MFMA_
"
[%[c28], %[c29], %[c30], %[c31]], acc[184:185], v[248:249], [%[c28], %[c29], %[c30], %[c31]
]
\n
"
_UK_MFMA_
"
v[156:159], acc[186:187], v[250:251], v[156:159
]
\n
"
_UK_MFMA_
"
[%[c28], %[c29], %[c30], %[c31]], acc[186:187], v[250:251], [%[c28], %[c29], %[c30], %[c31]
]
\n
"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[24:27], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[24:27], 0 offen offset:3072
\n
"
_UK_MFMA_
"
v[156:159], acc[188:189], v[252:253], v[156:159
]
\n
"
_UK_MFMA_
"
[%[c28], %[c29], %[c30], %[c31]], acc[188:189], v[252:253], [%[c28], %[c29], %[c30], %[c31]
]
\n
"
_UK_MFMA_
"
v[156:159], acc[190:191], v[254:255], v[156:159
]
\n
"
_UK_MFMA_
"
[%[c28], %[c29], %[c30], %[c31]], acc[190:191], v[254:255], [%[c28], %[c29], %[c30], %[c31]
]
\n
"
" ds_read_b128 v[220:223], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_7]
\n
"
" ds_read_b128 v[220:223], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_7]
\n
"
" s_waitcnt vmcnt(32)
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
"
v[160:163], acc[192:193], v[224:225], v[160:163
]
\n
"
_UK_MFMA_
"
[%[c32], %[c33], %[c34], %[c35]], acc[192:193], v[224:225], [%[c32], %[c33], %[c34], %[c35]
]
\n
"
_UK_MFMA_
"
v[160:163], acc[194:195], v[226:227], v[160:163
]
\n
"
_UK_MFMA_
"
[%[c32], %[c33], %[c34], %[c35]], acc[194:195], v[226:227], [%[c32], %[c33], %[c34], %[c35]
]
\n
"
" buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[24:27], 0 offen
\n
"
" buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[24:27], 0 offen
\n
"
_UK_MFMA_
"
v[160:163], acc[196:197], v[228:229], v[160:163
]
\n
"
_UK_MFMA_
"
[%[c32], %[c33], %[c34], %[c35]], acc[196:197], v[228:229], [%[c32], %[c33], %[c34], %[c35]
]
\n
"
_UK_MFMA_
"
v[160:163], acc[198:199], v[230:231], v[160:163
]
\n
"
_UK_MFMA_
"
[%[c32], %[c33], %[c34], %[c35]], acc[198:199], v[230:231], [%[c32], %[c33], %[c34], %[c35]
]
\n
"
_UK_MFMA_
"
v[160:163], acc[200:201], v[232:233], v[160:163
]
\n
"
_UK_MFMA_
"
[%[c32], %[c33], %[c34], %[c35]], acc[200:201], v[232:233], [%[c32], %[c33], %[c34], %[c35]
]
\n
"
_UK_MFMA_
"
v[160:163], acc[202:203], v[234:235], v[160:163
]
\n
"
_UK_MFMA_
"
[%[c32], %[c33], %[c34], %[c35]], acc[202:203], v[234:235], [%[c32], %[c33], %[c34], %[c35]
]
\n
"
" buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[24:27], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[24:27], 0 offen offset:1024
\n
"
_UK_MFMA_
"
v[160:163], acc[204:205], v[236:237], v[160:163
]
\n
"
_UK_MFMA_
"
[%[c32], %[c33], %[c34], %[c35]], acc[204:205], v[236:237], [%[c32], %[c33], %[c34], %[c35]
]
\n
"
_UK_MFMA_
"
v[160:163], acc[206:207], v[238:239], v[160:163
]
\n
"
_UK_MFMA_
"
[%[c32], %[c33], %[c34], %[c35]], acc[206:207], v[238:239], [%[c32], %[c33], %[c34], %[c35]
]
\n
"
_UK_MFMA_
"
v[164:167], acc[192:193], v[240:241], v[164:167
]
\n
"
_UK_MFMA_
"
[%[c36], %[c37], %[c38], %[c39]], acc[192:193], v[240:241], [%[c36], %[c37], %[c38], %[c39]
]
\n
"
_UK_MFMA_
"
v[164:167], acc[194:195], v[242:243], v[164:167
]
\n
"
_UK_MFMA_
"
[%[c36], %[c37], %[c38], %[c39]], acc[194:195], v[242:243], [%[c36], %[c37], %[c38], %[c39]
]
\n
"
" buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[24:27], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[24:27], 0 offen offset:2048
\n
"
_UK_MFMA_
"
v[164:167], acc[196:197], v[244:245], v[164:167
]
\n
"
_UK_MFMA_
"
[%[c36], %[c37], %[c38], %[c39]], acc[196:197], v[244:245], [%[c36], %[c37], %[c38], %[c39]
]
\n
"
_UK_MFMA_
"
v[164:167], acc[198:199], v[246:247], v[164:167
]
\n
"
_UK_MFMA_
"
[%[c36], %[c37], %[c38], %[c39]], acc[198:199], v[246:247], [%[c36], %[c37], %[c38], %[c39]
]
\n
"
_UK_MFMA_
"
v[164:167], acc[200:201], v[248:249], v[164:167
]
\n
"
_UK_MFMA_
"
[%[c36], %[c37], %[c38], %[c39]], acc[200:201], v[248:249], [%[c36], %[c37], %[c38], %[c39]
]
\n
"
_UK_MFMA_
"
v[164:167], acc[202:203], v[250:251], v[164:167
]
\n
"
_UK_MFMA_
"
[%[c36], %[c37], %[c38], %[c39]], acc[202:203], v[250:251], [%[c36], %[c37], %[c38], %[c39]
]
\n
"
" buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[24:27], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[24:27], 0 offen offset:3072
\n
"
_UK_MFMA_
"
v[164:167], acc[204:205], v[252:253], v[164:167
]
\n
"
_UK_MFMA_
"
[%[c36], %[c37], %[c38], %[c39]], acc[204:205], v[252:253], [%[c36], %[c37], %[c38], %[c39]
]
\n
"
_UK_MFMA_
"
v[164:167], acc[206:207], v[254:255], v[164:167
]
\n
"
_UK_MFMA_
"
[%[c36], %[c37], %[c38], %[c39]], acc[206:207], v[254:255], [%[c36], %[c37], %[c38], %[c39]
]
\n
"
_UK_MFMA_
"
v[168:171], acc[208:209], v[224:225], v[168:171
]
\n
"
_UK_MFMA_
"
[%[c40], %[c41], %[c42], %[c43]], acc[208:209], v[224:225], [%[c40], %[c41], %[c42], %[c43]
]
\n
"
_UK_MFMA_
"
v[168:171], acc[210:211], v[226:227], v[168:171
]
\n
"
_UK_MFMA_
"
[%[c40], %[c41], %[c42], %[c43]], acc[210:211], v[226:227], [%[c40], %[c41], %[c42], %[c43]
]
\n
"
" buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[24:27], 0 offen
\n
"
" buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[24:27], 0 offen
\n
"
_UK_MFMA_
"
v[168:171], acc[212:213], v[228:229], v[168:171
]
\n
"
_UK_MFMA_
"
[%[c40], %[c41], %[c42], %[c43]], acc[212:213], v[228:229], [%[c40], %[c41], %[c42], %[c43]
]
\n
"
_UK_MFMA_
"
v[168:171], acc[214:215], v[230:231], v[168:171
]
\n
"
_UK_MFMA_
"
[%[c40], %[c41], %[c42], %[c43]], acc[214:215], v[230:231], [%[c40], %[c41], %[c42], %[c43]
]
\n
"
_UK_MFMA_
"
v[168:171], acc[216:217], v[232:233], v[168:171
]
\n
"
_UK_MFMA_
"
[%[c40], %[c41], %[c42], %[c43]], acc[216:217], v[232:233], [%[c40], %[c41], %[c42], %[c43]
]
\n
"
_UK_MFMA_
"
v[168:171], acc[218:219], v[234:235], v[168:171
]
\n
"
_UK_MFMA_
"
[%[c40], %[c41], %[c42], %[c43]], acc[218:219], v[234:235], [%[c40], %[c41], %[c42], %[c43]
]
\n
"
" buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[24:27], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[24:27], 0 offen offset:1024
\n
"
_UK_MFMA_
"
v[168:171], acc[220:221], v[236:237], v[168:171
]
\n
"
_UK_MFMA_
"
[%[c40], %[c41], %[c42], %[c43]], acc[220:221], v[236:237], [%[c40], %[c41], %[c42], %[c43]
]
\n
"
_UK_MFMA_
"
v[168:171], acc[222:223], v[238:239], v[168:171
]
\n
"
_UK_MFMA_
"
[%[c40], %[c41], %[c42], %[c43]], acc[222:223], v[238:239], [%[c40], %[c41], %[c42], %[c43]
]
\n
"
_UK_MFMA_
"
v[172:175], acc[208:209], v[240:241], v[172:175
]
\n
"
_UK_MFMA_
"
[%[c44], %[c45], %[c46], %[c47]], acc[208:209], v[240:241], [%[c44], %[c45], %[c46], %[c47]
]
\n
"
_UK_MFMA_
"
v[172:175], acc[210:211], v[242:243], v[172:175
]
\n
"
_UK_MFMA_
"
[%[c44], %[c45], %[c46], %[c47]], acc[210:211], v[242:243], [%[c44], %[c45], %[c46], %[c47]
]
\n
"
" buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[24:27], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[24:27], 0 offen offset:2048
\n
"
_UK_MFMA_
"
v[172:175], acc[212:213], v[244:245], v[172:175
]
\n
"
_UK_MFMA_
"
[%[c44], %[c45], %[c46], %[c47]], acc[212:213], v[244:245], [%[c44], %[c45], %[c46], %[c47]
]
\n
"
_UK_MFMA_
"
v[172:175], acc[214:215], v[246:247], v[172:175
]
\n
"
_UK_MFMA_
"
[%[c44], %[c45], %[c46], %[c47]], acc[214:215], v[246:247], [%[c44], %[c45], %[c46], %[c47]
]
\n
"
_UK_MFMA_
"
v[172:175], acc[216:217], v[248:249], v[172:175
]
\n
"
_UK_MFMA_
"
[%[c44], %[c45], %[c46], %[c47]], acc[216:217], v[248:249], [%[c44], %[c45], %[c46], %[c47]
]
\n
"
_UK_MFMA_
"
v[172:175], acc[218:219], v[250:251], v[172:175
]
\n
"
_UK_MFMA_
"
[%[c44], %[c45], %[c46], %[c47]], acc[218:219], v[250:251], [%[c44], %[c45], %[c46], %[c47]
]
\n
"
" buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[24:27], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[24:27], 0 offen offset:3072
\n
"
_UK_MFMA_
"
v[172:175], acc[220:221], v[252:253], v[172:175
]
\n
"
_UK_MFMA_
"
[%[c44], %[c45], %[c46], %[c47]], acc[220:221], v[252:253], [%[c44], %[c45], %[c46], %[c47]
]
\n
"
_UK_MFMA_
"
v[172:175], acc[222:223], v[254:255], v[172:175
]
\n
"
_UK_MFMA_
"
[%[c44], %[c45], %[c46], %[c47]], acc[222:223], v[254:255], [%[c44], %[c45], %[c46], %[c47]
]
\n
"
" s_waitcnt vmcnt(32)
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
"
v[176:179], acc[224:225], v[224:225], v[176:179
]
\n
"
_UK_MFMA_
"
[%[c48], %[c49], %[c50], %[c51]], acc[224:225], v[224:225], [%[c48], %[c49], %[c50], %[c51]
]
\n
"
_UK_MFMA_
"
v[176:179], acc[226:227], v[226:227], v[176:179
]
\n
"
_UK_MFMA_
"
[%[c48], %[c49], %[c50], %[c51]], acc[226:227], v[226:227], [%[c48], %[c49], %[c50], %[c51]
]
\n
"
" buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[24:27], 0 offen
\n
"
" buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[24:27], 0 offen
\n
"
_UK_MFMA_
"
v[176:179], acc[228:229], v[228:229], v[176:179
]
\n
"
_UK_MFMA_
"
[%[c48], %[c49], %[c50], %[c51]], acc[228:229], v[228:229], [%[c48], %[c49], %[c50], %[c51]
]
\n
"
_UK_MFMA_
"
v[176:179], acc[230:231], v[230:231], v[176:179
]
\n
"
_UK_MFMA_
"
[%[c48], %[c49], %[c50], %[c51]], acc[230:231], v[230:231], [%[c48], %[c49], %[c50], %[c51]
]
\n
"
_UK_MFMA_
"
v[176:179], acc[232:233], v[232:233], v[176:179
]
\n
"
_UK_MFMA_
"
[%[c48], %[c49], %[c50], %[c51]], acc[232:233], v[232:233], [%[c48], %[c49], %[c50], %[c51]
]
\n
"
_UK_MFMA_
"
v[176:179], acc[234:235], v[234:235], v[176:179
]
\n
"
_UK_MFMA_
"
[%[c48], %[c49], %[c50], %[c51]], acc[234:235], v[234:235], [%[c48], %[c49], %[c50], %[c51]
]
\n
"
" buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[24:27], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[24:27], 0 offen offset:1024
\n
"
_UK_MFMA_
"
v[176:179], acc[236:237], v[236:237], v[176:179
]
\n
"
_UK_MFMA_
"
[%[c48], %[c49], %[c50], %[c51]], acc[236:237], v[236:237], [%[c48], %[c49], %[c50], %[c51]
]
\n
"
_UK_MFMA_
"
v[176:179], acc[238:239], v[238:239], v[176:179
]
\n
"
_UK_MFMA_
"
[%[c48], %[c49], %[c50], %[c51]], acc[238:239], v[238:239], [%[c48], %[c49], %[c50], %[c51]
]
\n
"
_UK_MFMA_
"
v[180:183], acc[224:225], v[240:241], v[180:183
]
\n
"
_UK_MFMA_
"
[%[c52], %[c53], %[c54], %[c55]], acc[224:225], v[240:241], [%[c52], %[c53], %[c54], %[c55]
]
\n
"
_UK_MFMA_
"
v[180:183], acc[226:227], v[242:243], v[180:183
]
\n
"
_UK_MFMA_
"
[%[c52], %[c53], %[c54], %[c55]], acc[226:227], v[242:243], [%[c52], %[c53], %[c54], %[c55]
]
\n
"
" buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[24:27], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[24:27], 0 offen offset:2048
\n
"
_UK_MFMA_
"
v[180:183], acc[228:229], v[244:245], v[180:183
]
\n
"
_UK_MFMA_
"
[%[c52], %[c53], %[c54], %[c55]], acc[228:229], v[244:245], [%[c52], %[c53], %[c54], %[c55]
]
\n
"
_UK_MFMA_
"
v[180:183], acc[230:231], v[246:247], v[180:183
]
\n
"
_UK_MFMA_
"
[%[c52], %[c53], %[c54], %[c55]], acc[230:231], v[246:247], [%[c52], %[c53], %[c54], %[c55]
]
\n
"
_UK_MFMA_
"
v[180:183], acc[232:233], v[248:249], v[180:183
]
\n
"
_UK_MFMA_
"
[%[c52], %[c53], %[c54], %[c55]], acc[232:233], v[248:249], [%[c52], %[c53], %[c54], %[c55]
]
\n
"
_UK_MFMA_
"
v[180:183], acc[234:235], v[250:251], v[180:183
]
\n
"
_UK_MFMA_
"
[%[c52], %[c53], %[c54], %[c55]], acc[234:235], v[250:251], [%[c52], %[c53], %[c54], %[c55]
]
\n
"
" buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[24:27], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[24:27], 0 offen offset:3072
\n
"
_UK_MFMA_
"
v[180:183], acc[236:237], v[252:253], v[180:183
]
\n
"
_UK_MFMA_
"
[%[c52], %[c53], %[c54], %[c55]], acc[236:237], v[252:253], [%[c52], %[c53], %[c54], %[c55]
]
\n
"
_UK_MFMA_
"
v[180:183], acc[238:239], v[254:255], v[180:183
]
\n
"
_UK_MFMA_
"
[%[c52], %[c53], %[c54], %[c55]], acc[238:239], v[254:255], [%[c52], %[c53], %[c54], %[c55]
]
\n
"
_UK_MFMA_
"
v[184:187], acc[240:241], v[224:225], v[184:187
]
\n
"
_UK_MFMA_
"
[%[c56], %[c57], %[c58], %[c59]], acc[240:241], v[224:225], [%[c56], %[c57], %[c58], %[c59]
]
\n
"
_UK_MFMA_
"
v[184:187], acc[242:243], v[226:227], v[184:187
]
\n
"
_UK_MFMA_
"
[%[c56], %[c57], %[c58], %[c59]], acc[242:243], v[226:227], [%[c56], %[c57], %[c58], %[c59]
]
\n
"
" buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[24:27], 0 offen
\n
"
" buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[24:27], 0 offen
\n
"
_UK_MFMA_
"
v[184:187], acc[244:245], v[228:229], v[184:187
]
\n
"
_UK_MFMA_
"
[%[c56], %[c57], %[c58], %[c59]], acc[244:245], v[228:229], [%[c56], %[c57], %[c58], %[c59]
]
\n
"
_UK_MFMA_
"
v[184:187], acc[246:247], v[230:231], v[184:187
]
\n
"
_UK_MFMA_
"
[%[c56], %[c57], %[c58], %[c59]], acc[246:247], v[230:231], [%[c56], %[c57], %[c58], %[c59]
]
\n
"
_UK_MFMA_
"
v[184:187], acc[248:249], v[232:233], v[184:187
]
\n
"
_UK_MFMA_
"
[%[c56], %[c57], %[c58], %[c59]], acc[248:249], v[232:233], [%[c56], %[c57], %[c58], %[c59]
]
\n
"
_UK_MFMA_
"
v[184:187], acc[250:251], v[234:235], v[184:187
]
\n
"
_UK_MFMA_
"
[%[c56], %[c57], %[c58], %[c59]], acc[250:251], v[234:235], [%[c56], %[c57], %[c58], %[c59]
]
\n
"
" buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[24:27], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[24:27], 0 offen offset:1024
\n
"
_UK_MFMA_
"
v[184:187], acc[252:253], v[236:237], v[184:187
]
\n
"
_UK_MFMA_
"
[%[c56], %[c57], %[c58], %[c59]], acc[252:253], v[236:237], [%[c56], %[c57], %[c58], %[c59]
]
\n
"
_UK_MFMA_
"
v[184:187], acc[254:255], v[238:239], v[184:187
]
\n
"
_UK_MFMA_
"
[%[c56], %[c57], %[c58], %[c59]], acc[254:255], v[238:239], [%[c56], %[c57], %[c58], %[c59]
]
\n
"
_UK_MFMA_
"
v[188:191], acc[240:241], v[240:241], v[188:191
]
\n
"
_UK_MFMA_
"
[%[c60], %[c61], %[c62], %[c63]], acc[240:241], v[240:241], [%[c60], %[c61], %[c62], %[c63]
]
\n
"
_UK_MFMA_
"
v[188:191], acc[242:243], v[242:243], v[188:191
]
\n
"
_UK_MFMA_
"
[%[c60], %[c61], %[c62], %[c63]], acc[242:243], v[242:243], [%[c60], %[c61], %[c62], %[c63]
]
\n
"
" buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[24:27], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[24:27], 0 offen offset:2048
\n
"
_UK_MFMA_
"
v[188:191], acc[244:245], v[244:245], v[188:191
]
\n
"
_UK_MFMA_
"
[%[c60], %[c61], %[c62], %[c63]], acc[244:245], v[244:245], [%[c60], %[c61], %[c62], %[c63]
]
\n
"
_UK_MFMA_
"
v[188:191], acc[246:247], v[246:247], v[188:191
]
\n
"
_UK_MFMA_
"
[%[c60], %[c61], %[c62], %[c63]], acc[246:247], v[246:247], [%[c60], %[c61], %[c62], %[c63]
]
\n
"
_UK_MFMA_
"
v[188:191], acc[248:249], v[248:249], v[188:191
]
\n
"
_UK_MFMA_
"
[%[c60], %[c61], %[c62], %[c63]], acc[248:249], v[248:249], [%[c60], %[c61], %[c62], %[c63]
]
\n
"
_UK_MFMA_
"
v[188:191], acc[250:251], v[250:251], v[188:191
]
\n
"
_UK_MFMA_
"
[%[c60], %[c61], %[c62], %[c63]], acc[250:251], v[250:251], [%[c60], %[c61], %[c62], %[c63]
]
\n
"
" buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[24:27], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[24:27], 0 offen offset:3072
\n
"
_UK_MFMA_
"
v[188:191], acc[252:253], v[252:253], v[188:191
]
\n
"
_UK_MFMA_
"
[%[c60], %[c61], %[c62], %[c63]], acc[252:253], v[252:253], [%[c60], %[c61], %[c62], %[c63]
]
\n
"
_UK_MFMA_
"
v[188:191], acc[254:255], v[254:255], v[188:191
]
\n
"
_UK_MFMA_
"
[%[c60], %[c61], %[c62], %[c63]], acc[254:255], v[254:255], [%[c60], %[c61], %[c62], %[c63]
]
\n
"
" s_add_u32 s60, 0x00000300, s80
\n
"
" s_add_u32 s60, 0x00000300, s80
\n
"
" s_cmp_lt_u32 s60, %[s_loop_cnt]
\n
"
" s_cmp_lt_u32 s60, %[s_loop_cnt]
\n
"
" s_cselect_b32 s57, s57, 0
\n
"
" s_cselect_b32 s57, s57, 0
\n
"
...
@@ -790,3 +790,4 @@
...
@@ -790,3 +790,4 @@
#undef _DEQUAN_CVT_
#undef _DEQUAN_CVT_
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment