Commit 26d84960 authored by shengnxu
Browse files

Change inline asm parameter style: use named operands (%[c0], %[c1], ...) instead of hard-coded VGPR ranges (v[128:131], ...)

parent d0c80b12
...@@ -384,22 +384,70 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16 ...@@ -384,22 +384,70 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16
#include "uk/flatmm_uk_gfx9_32x512x256_1x1x1_16x16x32_int8.inc" #include "uk/flatmm_uk_gfx9_32x512x256_1x1x1_16x16x32_int8.inc"
#undef CK_TILE_FLATMM_UK_MFMA #undef CK_TILE_FLATMM_UK_MFMA
: [s_loop_cnt]"+s"(loop_cnt), : [s_loop_cnt]"+s"(loop_cnt),
// [v_acc_0]"+v"(v_acc[0]), [c0]"+v"(v_z0),
// [v_acc_1]"+v"(v_acc[1]), [c1]"+v"(v_z1),
// [v_acc_2]"+v"(v_acc[2]), [c2]"+v"(v_z2),
// [v_acc_3]"+v"(v_acc[3]), [c3]"+v"(v_z3),
// [v_acc_4]"+v"(v_acc[4]), [c4]"+v"(v_z4),
// [v_acc_5]"+v"(v_acc[5]), [c5]"+v"(v_z5),
// [v_acc_6]"+v"(v_acc[6]), [c6]"+v"(v_z6),
// [v_acc_7]"+v"(v_acc[7]), [c7]"+v"(v_z7),
// [v_acc_8]"+v"(v_acc[8]), [c8]"+v"(v_z8),
// [v_acc_9]"+v"(v_acc[9]), [c9]"+v"(v_z9),
// [v_acc_10]"+v"(v_acc[10]), [c10]"+v"(v_z10),
// [v_acc_11]"+v"(v_acc[11]), [c11]"+v"(v_z11),
// [v_acc_12]"+v"(v_acc[12]), [c12]"+v"(v_z12),
// [v_acc_13]"+v"(v_acc[13]), [c13]"+v"(v_z13),
// [v_acc_14]"+v"(v_acc[14]), [c14]"+v"(v_z14),
// [v_acc_15]"+v"(v_acc[15]), [c15]"+v"(v_z15),
[c16]"+v"(v_z16),
[c17]"+v"(v_z17),
[c18]"+v"(v_z18),
[c19]"+v"(v_z19),
[c20]"+v"(v_z20),
[c21]"+v"(v_z21),
[c22]"+v"(v_z22),
[c23]"+v"(v_z23),
[c24]"+v"(v_z24),
[c25]"+v"(v_z25),
[c26]"+v"(v_z26),
[c27]"+v"(v_z27),
[c28]"+v"(v_z28),
[c29]"+v"(v_z29),
[c30]"+v"(v_z30),
[c31]"+v"(v_z31),
[c32]"+v"(v_z32),
[c33]"+v"(v_z33),
[c34]"+v"(v_z34),
[c35]"+v"(v_z35),
[c36]"+v"(v_z36),
[c37]"+v"(v_z37),
[c38]"+v"(v_z38),
[c39]"+v"(v_z39),
[c40]"+v"(v_z40),
[c41]"+v"(v_z41),
[c42]"+v"(v_z42),
[c43]"+v"(v_z43),
[c44]"+v"(v_z44),
[c45]"+v"(v_z45),
[c46]"+v"(v_z46),
[c47]"+v"(v_z47),
[c48]"+v"(v_z48),
[c49]"+v"(v_z49),
[c50]"+v"(v_z50),
[c51]"+v"(v_z51),
[c52]"+v"(v_z52),
[c53]"+v"(v_z53),
[c54]"+v"(v_z54),
[c55]"+v"(v_z55),
[c56]"+v"(v_z56),
[c57]"+v"(v_z57),
[c58]"+v"(v_z58),
[c59]"+v"(v_z59),
[c60]"+v"(v_z60),
[c61]"+v"(v_z61),
[c62]"+v"(v_z62),
[c63]"+v"(v_z63),
[v_token_id0]"+v"(temp0), [v_token_id0]"+v"(temp0),
[v_token_id1]"+v"(temp1), [v_token_id1]"+v"(temp1),
[s_mem_]"+r"(smem) [s_mem_]"+r"(smem)
...@@ -533,81 +581,74 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16 ...@@ -533,81 +581,74 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16
); );
// clang-format on // clang-format on
#pragma clang diagnostic pop #pragma clang diagnostic pop
int32x4_t v_acc[16]{0};
v_acc[0][0] = v_z0;
v_acc[0][1] = v_z1;
v_acc[0][2] = v_z2;
v_acc[0][3] = v_z3;
v_acc[1][0] = v_z4;
v_acc[1][1] = v_z5;
v_acc[1][2] = v_z6;
v_acc[1][3] = v_z7;
v_acc[2][0] = v_z8;
v_acc[2][1] = v_z9;
v_acc[2][2] = v_z10;
v_acc[2][3] = v_z11;
v_acc[3][0] = v_z12;
v_acc[3][1] = v_z13;
v_acc[3][2] = v_z14;
v_acc[3][3] = v_z15;
v_acc[4][0] = v_z16;
v_acc[4][1] = v_z17;
v_acc[4][2] = v_z18;
v_acc[4][3] = v_z19;
v_acc[5][0] = v_z20;
v_acc[5][1] = v_z21;
v_acc[5][2] = v_z22;
v_acc[5][3] = v_z23;
v_acc[6][0] = v_z24;
v_acc[6][1] = v_z25;
v_acc[6][2] = v_z26;
v_acc[6][3] = v_z27;
v_acc[7][0] = v_z28;
v_acc[7][1] = v_z29;
v_acc[7][2] = v_z30;
v_acc[7][3] = v_z31;
v_acc[8][0] = v_z32;
v_acc[8][1] = v_z33;
v_acc[8][2] = v_z34;
v_acc[8][3] = v_z35;
v_acc[9][0] = v_z36;
v_acc[9][1] = v_z37;
v_acc[9][2] = v_z38;
v_acc[9][3] = v_z39;
v_acc[10][0] = v_z40;
v_acc[10][1] = v_z41;
v_acc[10][2] = v_z42;
v_acc[10][3] = v_z43;
v_acc[11][0] = v_z44;
v_acc[11][1] = v_z45;
v_acc[11][2] = v_z46;
v_acc[11][3] = v_z47;
v_acc[12][0] = v_z48;
v_acc[12][1] = v_z49;
v_acc[12][2] = v_z50;
v_acc[12][3] = v_z51;
v_acc[13][0] = v_z52;
v_acc[13][1] = v_z53;
v_acc[13][2] = v_z54;
v_acc[13][3] = v_z55;
v_acc[14][0] = v_z56;
v_acc[14][1] = v_z57;
v_acc[14][2] = v_z58;
v_acc[14][3] = v_z59;
v_acc[15][0] = v_z60;
v_acc[15][1] = v_z61;
v_acc[15][2] = v_z62;
v_acc[15][3] = v_z63;
// return local scratch // return local scratch
auto c = MakeCBlockTile(); auto c = MakeCBlockTile();
for(auto i = 0; i < 16; i++) c.get_thread_buffer()[0] = v_z0;
{ c.get_thread_buffer()[1] = v_z1;
c.get_thread_buffer()[4 * i + 0] = v_acc[i].x; c.get_thread_buffer()[2] = v_z2;
c.get_thread_buffer()[4 * i + 1] = v_acc[i].y; c.get_thread_buffer()[3] = v_z3;
c.get_thread_buffer()[4 * i + 2] = v_acc[i].z; c.get_thread_buffer()[4] = v_z4;
c.get_thread_buffer()[4 * i + 3] = v_acc[i].w; c.get_thread_buffer()[5] = v_z5;
} c.get_thread_buffer()[6] = v_z6;
c.get_thread_buffer()[7] = v_z7;
c.get_thread_buffer()[8] = v_z8;
c.get_thread_buffer()[9] = v_z9;
c.get_thread_buffer()[10] = v_z10;
c.get_thread_buffer()[11] = v_z11;
c.get_thread_buffer()[12] = v_z12;
c.get_thread_buffer()[13] = v_z13;
c.get_thread_buffer()[14] = v_z14;
c.get_thread_buffer()[15] = v_z15;
c.get_thread_buffer()[16] = v_z16;
c.get_thread_buffer()[17] = v_z17;
c.get_thread_buffer()[18] = v_z18;
c.get_thread_buffer()[19] = v_z19;
c.get_thread_buffer()[20] = v_z20;
c.get_thread_buffer()[21] = v_z21;
c.get_thread_buffer()[22] = v_z22;
c.get_thread_buffer()[23] = v_z23;
c.get_thread_buffer()[24] = v_z24;
c.get_thread_buffer()[25] = v_z25;
c.get_thread_buffer()[26] = v_z26;
c.get_thread_buffer()[27] = v_z27;
c.get_thread_buffer()[28] = v_z28;
c.get_thread_buffer()[29] = v_z29;
c.get_thread_buffer()[30] = v_z30;
c.get_thread_buffer()[31] = v_z31;
c.get_thread_buffer()[32] = v_z32;
c.get_thread_buffer()[33] = v_z33;
c.get_thread_buffer()[34] = v_z34;
c.get_thread_buffer()[35] = v_z35;
c.get_thread_buffer()[36] = v_z36;
c.get_thread_buffer()[37] = v_z37;
c.get_thread_buffer()[38] = v_z38;
c.get_thread_buffer()[39] = v_z39;
c.get_thread_buffer()[40] = v_z40;
c.get_thread_buffer()[41] = v_z41;
c.get_thread_buffer()[42] = v_z42;
c.get_thread_buffer()[43] = v_z43;
c.get_thread_buffer()[44] = v_z44;
c.get_thread_buffer()[45] = v_z45;
c.get_thread_buffer()[46] = v_z46;
c.get_thread_buffer()[47] = v_z47;
c.get_thread_buffer()[48] = v_z48;
c.get_thread_buffer()[49] = v_z49;
c.get_thread_buffer()[50] = v_z50;
c.get_thread_buffer()[51] = v_z51;
c.get_thread_buffer()[52] = v_z52;
c.get_thread_buffer()[53] = v_z53;
c.get_thread_buffer()[54] = v_z54;
c.get_thread_buffer()[55] = v_z55;
c.get_thread_buffer()[56] = v_z56;
c.get_thread_buffer()[57] = v_z57;
c.get_thread_buffer()[58] = v_z58;
c.get_thread_buffer()[59] = v_z59;
c.get_thread_buffer()[60] = v_z60;
c.get_thread_buffer()[61] = v_z61;
c.get_thread_buffer()[62] = v_z62;
c.get_thread_buffer()[63] = v_z63;
return c; return c;
} }
}; };
......
...@@ -189,193 +189,193 @@ ...@@ -189,193 +189,193 @@
" label_start: \n" " label_start: \n"
" s_waitcnt vmcnt(24) & lgkmcnt(0) \n" " s_waitcnt vmcnt(24) & lgkmcnt(0) \n"
" s_barrier \n" " s_barrier \n"
_UK_MFMA_ " v[128:131], acc[0:1], v[192:193], v[128:131] \n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[0:1], v[192:193], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " v[128:131], acc[2:3], v[194:195], v[128:131] \n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[2:3], v[194:195], [%[c0], %[c1], %[c2], %[c3]] \n"
" buffer_load_dwordx4 acc[128:131], %[v_os_b0], s[24:27], 0 offen \n" " buffer_load_dwordx4 acc[128:131], %[v_os_b0], s[24:27], 0 offen \n"
_UK_MFMA_ " v[128:131], acc[4:5], v[196:197], v[128:131] \n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[4:5], v[196:197], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " v[128:131], acc[6:7], v[198:199], v[128:131] \n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[6:7], v[198:199], [%[c0], %[c1], %[c2], %[c3]] \n"
" buffer_load_dword s[20:23], 0 offen lds \n" " buffer_load_dword s[20:23], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n" " s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " v[128:131], acc[8:9], v[200:201], v[128:131] \n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[8:9], v[200:201], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " v[128:131], acc[10:11], v[202:203], v[128:131] \n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[10:11], v[202:203], [%[c0], %[c1], %[c2], %[c3]] \n"
" buffer_load_dwordx4 acc[132:135], %[v_os_b0], s[24:27], 0 offen offset:1024 \n" " buffer_load_dwordx4 acc[132:135], %[v_os_b0], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " v[128:131], acc[12:13], v[204:205], v[128:131] \n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[12:13], v[204:205], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " v[128:131], acc[14:15], v[206:207], v[128:131] \n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[14:15], v[206:207], [%[c0], %[c1], %[c2], %[c3]] \n"
" buffer_load_dword s[20:23], 0 offen lds \n" " buffer_load_dword s[20:23], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n" " s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " v[132:135], acc[0:1], v[208:209], v[132:135] \n" _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[0:1], v[208:209], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " v[132:135], acc[2:3], v[210:211], v[132:135] \n" _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[2:3], v[210:211], [%[c4], %[c5], %[c6], %[c7]] \n"
" buffer_load_dwordx4 acc[136:139], %[v_os_b0], s[24:27], 0 offen offset:2048 \n" " buffer_load_dwordx4 acc[136:139], %[v_os_b0], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " v[132:135], acc[4:5], v[212:213], v[132:135] \n" _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[4:5], v[212:213], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " v[132:135], acc[6:7], v[214:215], v[132:135] \n" _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[6:7], v[214:215], [%[c4], %[c5], %[c6], %[c7]] \n"
" buffer_load_dword s[20:23], 0 offen lds \n" " buffer_load_dword s[20:23], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n" " s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " v[132:135], acc[8:9], v[216:217], v[132:135] \n" _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[8:9], v[216:217], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " v[132:135], acc[10:11], v[218:219], v[132:135] \n" _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[10:11], v[218:219], [%[c4], %[c5], %[c6], %[c7]] \n"
" buffer_load_dwordx4 acc[140:143], %[v_os_b0], s[24:27], 0 offen offset:3072 \n" " buffer_load_dwordx4 acc[140:143], %[v_os_b0], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " v[132:135], acc[12:13], v[220:221], v[132:135] \n" _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[12:13], v[220:221], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " v[132:135], acc[14:15], v[222:223], v[132:135] \n" _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[14:15], v[222:223], [%[c4], %[c5], %[c6], %[c7]] \n"
" buffer_load_dword s[20:23], 0 offen lds \n" " buffer_load_dword s[20:23], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n" " s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " v[136:139], acc[16:17], v[192:193], v[136:139] \n" _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[16:17], v[192:193], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " v[136:139], acc[18:19], v[194:195], v[136:139] \n" _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[18:19], v[194:195], [%[c8], %[c9], %[c10], %[c11]] \n"
" buffer_load_dwordx4 acc[144:147], %[v_os_b1], s[24:27], 0 offen \n" " buffer_load_dwordx4 acc[144:147], %[v_os_b1], s[24:27], 0 offen \n"
_UK_MFMA_ " v[136:139], acc[20:21], v[196:197], v[136:139] \n" _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[20:21], v[196:197], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " v[136:139], acc[22:23], v[198:199], v[136:139] \n" _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[22:23], v[198:199], [%[c8], %[c9], %[c10], %[c11]] \n"
" buffer_load_dword s[20:23], 0 offen lds \n" " buffer_load_dword s[20:23], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n" " s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " v[136:139], acc[24:25], v[200:201], v[136:139] \n" _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[24:25], v[200:201], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " v[136:139], acc[26:27], v[202:203], v[136:139] \n" _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[26:27], v[202:203], [%[c8], %[c9], %[c10], %[c11]] \n"
" buffer_load_dwordx4 acc[148:151], %[v_os_b1], s[24:27], 0 offen offset:1024 \n" " buffer_load_dwordx4 acc[148:151], %[v_os_b1], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " v[136:139], acc[28:29], v[204:205], v[136:139] \n" _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[28:29], v[204:205], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " v[136:139], acc[30:31], v[206:207], v[136:139] \n" _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[30:31], v[206:207], [%[c8], %[c9], %[c10], %[c11]] \n"
" buffer_load_dword s[20:23], 0 offen lds \n" " buffer_load_dword s[20:23], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n" " s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " v[140:143], acc[16:17], v[208:209], v[140:143] \n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[16:17], v[208:209], [%[c12], %[c13], %[c14], %[c15]] \n"
_UK_MFMA_ " v[140:143], acc[18:19], v[210:211], v[140:143] \n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[18:19], v[210:211], [%[c12], %[c13], %[c14], %[c15]] \n"
" buffer_load_dwordx4 acc[152:155], %[v_os_b1], s[24:27], 0 offen offset:2048 \n" " buffer_load_dwordx4 acc[152:155], %[v_os_b1], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " v[140:143], acc[20:21], v[212:213], v[140:143] \n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[20:21], v[212:213], [%[c12], %[c13], %[c14], %[c15]] \n"
_UK_MFMA_ " v[140:143], acc[22:23], v[214:215], v[140:143] \n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[22:23], v[214:215], [%[c12], %[c13], %[c14], %[c15]] \n"
" buffer_load_dword s[20:23], 0 offen lds \n" " buffer_load_dword s[20:23], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n" " s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " v[140:143], acc[24:25], v[216:217], v[140:143] \n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[24:25], v[216:217], [%[c12], %[c13], %[c14], %[c15]] \n"
_UK_MFMA_ " v[140:143], acc[26:27], v[218:219], v[140:143] \n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[26:27], v[218:219], [%[c12], %[c13], %[c14], %[c15]] \n"
" buffer_load_dwordx4 acc[156:159], %[v_os_b1], s[24:27], 0 offen offset:3072 \n" " buffer_load_dwordx4 acc[156:159], %[v_os_b1], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " v[140:143], acc[28:29], v[220:221], v[140:143] \n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[28:29], v[220:221], [%[c12], %[c13], %[c14], %[c15]] \n"
_UK_MFMA_ " v[140:143], acc[30:31], v[222:223], v[140:143] \n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[30:31], v[222:223], [%[c12], %[c13], %[c14], %[c15]] \n"
" buffer_load_dword s[20:23], 0 offen lds \n" " buffer_load_dword s[20:23], 0 offen lds \n"
" s_add_u32 m0, %[smem_sz], %[s_m0_init] \n" " s_add_u32 m0, %[smem_sz], %[s_m0_init] \n"
" s_waitcnt vmcnt(32) \n" " s_waitcnt vmcnt(32) \n"
_UK_MFMA_ " v[144:147], acc[32:33], v[192:193], v[144:147] \n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[32:33], v[192:193], [%[c16], %[c17], %[c18], %[c19]] \n"
_UK_MFMA_ " v[144:147], acc[34:35], v[194:195], v[144:147] \n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[34:35], v[194:195], [%[c16], %[c17], %[c18], %[c19]] \n"
" buffer_load_dwordx4 acc[160:163], %[v_os_b2], s[24:27], 0 offen \n" " buffer_load_dwordx4 acc[160:163], %[v_os_b2], s[24:27], 0 offen \n"
_UK_MFMA_ " v[144:147], acc[36:37], v[196:197], v[144:147] \n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[36:37], v[196:197], [%[c16], %[c17], %[c18], %[c19]] \n"
_UK_MFMA_ " v[144:147], acc[38:39], v[198:199], v[144:147] \n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[38:39], v[198:199], [%[c16], %[c17], %[c18], %[c19]] \n"
" ds_read_b128 v[224:227], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_0] \n" " ds_read_b128 v[224:227], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_0] \n"
_UK_MFMA_ " v[144:147], acc[40:41], v[200:201], v[144:147] \n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[40:41], v[200:201], [%[c16], %[c17], %[c18], %[c19]] \n"
_UK_MFMA_ " v[144:147], acc[42:43], v[202:203], v[144:147] \n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[42:43], v[202:203], [%[c16], %[c17], %[c18], %[c19]] \n"
" buffer_load_dwordx4 acc[164:167], %[v_os_b2], s[24:27], 0 offen offset:1024 \n" " buffer_load_dwordx4 acc[164:167], %[v_os_b2], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " v[144:147], acc[44:45], v[204:205], v[144:147] \n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[44:45], v[204:205], [%[c16], %[c17], %[c18], %[c19]] \n"
_UK_MFMA_ " v[144:147], acc[46:47], v[206:207], v[144:147] \n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[46:47], v[206:207], [%[c16], %[c17], %[c18], %[c19]] \n"
" ds_read_b128 v[228:231], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_1] \n" " ds_read_b128 v[228:231], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_1] \n"
_UK_MFMA_ " v[148:151], acc[32:33], v[208:209], v[148:151] \n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[32:33], v[208:209], [%[c20], %[c21], %[c22], %[c23]] \n"
_UK_MFMA_ " v[148:151], acc[34:35], v[210:211], v[148:151] \n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[34:35], v[210:211], [%[c20], %[c21], %[c22], %[c23]] \n"
" buffer_load_dwordx4 acc[168:171], %[v_os_b2], s[24:27], 0 offen offset:2048 \n" " buffer_load_dwordx4 acc[168:171], %[v_os_b2], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " v[148:151], acc[36:37], v[212:213], v[148:151] \n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[36:37], v[212:213], [%[c20], %[c21], %[c22], %[c23]] \n"
_UK_MFMA_ " v[148:151], acc[38:39], v[214:215], v[148:151] \n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[38:39], v[214:215], [%[c20], %[c21], %[c22], %[c23]] \n"
" ds_read_b128 v[232:235], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_2] \n" " ds_read_b128 v[232:235], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_2] \n"
_UK_MFMA_ " v[148:151], acc[40:41], v[216:217], v[148:151] \n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[40:41], v[216:217], [%[c20], %[c21], %[c22], %[c23]] \n"
_UK_MFMA_ " v[148:151], acc[42:43], v[218:219], v[148:151] \n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[42:43], v[218:219], [%[c20], %[c21], %[c22], %[c23]] \n"
" buffer_load_dwordx4 acc[172:175], %[v_os_b2], s[24:27], 0 offen offset:3072 \n" " buffer_load_dwordx4 acc[172:175], %[v_os_b2], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " v[148:151], acc[44:45], v[220:221], v[148:151] \n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[44:45], v[220:221], [%[c20], %[c21], %[c22], %[c23]] \n"
_UK_MFMA_ " v[148:151], acc[46:47], v[222:223], v[148:151] \n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[46:47], v[222:223], [%[c20], %[c21], %[c22], %[c23]] \n"
" ds_read_b128 v[236:239], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_3] \n" " ds_read_b128 v[236:239], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_3] \n"
_UK_MFMA_ " v[152:155], acc[48:49], v[192:193], v[152:155] \n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[48:49], v[192:193], [%[c24], %[c25], %[c26], %[c27]] \n"
_UK_MFMA_ " v[152:155], acc[50:51], v[194:195], v[152:155] \n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[50:51], v[194:195], [%[c24], %[c25], %[c26], %[c27]] \n"
" buffer_load_dwordx4 acc[176:179], %[v_os_b3], s[24:27], 0 offen \n" " buffer_load_dwordx4 acc[176:179], %[v_os_b3], s[24:27], 0 offen \n"
_UK_MFMA_ " v[152:155], acc[52:53], v[196:197], v[152:155] \n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[52:53], v[196:197], [%[c24], %[c25], %[c26], %[c27]] \n"
_UK_MFMA_ " v[152:155], acc[54:55], v[198:199], v[152:155] \n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[54:55], v[198:199], [%[c24], %[c25], %[c26], %[c27]] \n"
" ds_read_b128 v[240:243], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_4] \n" " ds_read_b128 v[240:243], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_4] \n"
_UK_MFMA_ " v[152:155], acc[56:57], v[200:201], v[152:155] \n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[56:57], v[200:201], [%[c24], %[c25], %[c26], %[c27]] \n"
_UK_MFMA_ " v[152:155], acc[58:59], v[202:203], v[152:155] \n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[58:59], v[202:203], [%[c24], %[c25], %[c26], %[c27]] \n"
" buffer_load_dwordx4 acc[180:183], %[v_os_b3], s[24:27], 0 offen offset:1024 \n" " buffer_load_dwordx4 acc[180:183], %[v_os_b3], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " v[152:155], acc[60:61], v[204:205], v[152:155] \n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[60:61], v[204:205], [%[c24], %[c25], %[c26], %[c27]] \n"
_UK_MFMA_ " v[152:155], acc[62:63], v[206:207], v[152:155] \n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[62:63], v[206:207], [%[c24], %[c25], %[c26], %[c27]] \n"
" ds_read_b128 v[244:247], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_5] \n" " ds_read_b128 v[244:247], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_5] \n"
_UK_MFMA_ " v[156:159], acc[48:49], v[208:209], v[156:159] \n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[48:49], v[208:209], [%[c28], %[c29], %[c30], %[c31]] \n"
_UK_MFMA_ " v[156:159], acc[50:51], v[210:211], v[156:159] \n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[50:51], v[210:211], [%[c28], %[c29], %[c30], %[c31]] \n"
" buffer_load_dwordx4 acc[184:187], %[v_os_b3], s[24:27], 0 offen offset:2048 \n" " buffer_load_dwordx4 acc[184:187], %[v_os_b3], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " v[156:159], acc[52:53], v[212:213], v[156:159] \n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[52:53], v[212:213], [%[c28], %[c29], %[c30], %[c31]] \n"
_UK_MFMA_ " v[156:159], acc[54:55], v[214:215], v[156:159] \n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[54:55], v[214:215], [%[c28], %[c29], %[c30], %[c31]] \n"
" ds_read_b128 v[248:251], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_6] \n" " ds_read_b128 v[248:251], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_6] \n"
_UK_MFMA_ " v[156:159], acc[56:57], v[216:217], v[156:159] \n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[56:57], v[216:217], [%[c28], %[c29], %[c30], %[c31]] \n"
_UK_MFMA_ " v[156:159], acc[58:59], v[218:219], v[156:159] \n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[58:59], v[218:219], [%[c28], %[c29], %[c30], %[c31]] \n"
" buffer_load_dwordx4 acc[188:191], %[v_os_b3], s[24:27], 0 offen offset:3072 \n" " buffer_load_dwordx4 acc[188:191], %[v_os_b3], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " v[156:159], acc[60:61], v[220:221], v[156:159] \n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[60:61], v[220:221], [%[c28], %[c29], %[c30], %[c31]] \n"
_UK_MFMA_ " v[156:159], acc[62:63], v[222:223], v[156:159] \n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[62:63], v[222:223], [%[c28], %[c29], %[c30], %[c31]] \n"
" ds_read_b128 v[252:255], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_7] \n" " ds_read_b128 v[252:255], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_7] \n"
" s_waitcnt vmcnt(32) \n" " s_waitcnt vmcnt(32) \n"
_UK_MFMA_ " v[160:163], acc[64:65], v[192:193], v[160:163] \n" _UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[64:65], v[192:193], [%[c32], %[c33], %[c34], %[c35]] \n"
_UK_MFMA_ " v[160:163], acc[66:67], v[194:195], v[160:163] \n" _UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[66:67], v[194:195], [%[c32], %[c33], %[c34], %[c35]] \n"
" buffer_load_dwordx4 acc[192:195], %[v_os_b4], s[24:27], 0 offen \n" " buffer_load_dwordx4 acc[192:195], %[v_os_b4], s[24:27], 0 offen \n"
_UK_MFMA_ " v[160:163], acc[68:69], v[196:197], v[160:163] \n" _UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[68:69], v[196:197], [%[c32], %[c33], %[c34], %[c35]] \n"
_UK_MFMA_ " v[160:163], acc[70:71], v[198:199], v[160:163] \n" _UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[70:71], v[198:199], [%[c32], %[c33], %[c34], %[c35]] \n"
_UK_MFMA_ " v[160:163], acc[72:73], v[200:201], v[160:163] \n" _UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[72:73], v[200:201], [%[c32], %[c33], %[c34], %[c35]] \n"
_UK_MFMA_ " v[160:163], acc[74:75], v[202:203], v[160:163] \n" _UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[74:75], v[202:203], [%[c32], %[c33], %[c34], %[c35]] \n"
" buffer_load_dwordx4 acc[196:199], %[v_os_b4], s[24:27], 0 offen offset:1024 \n" " buffer_load_dwordx4 acc[196:199], %[v_os_b4], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " v[160:163], acc[76:77], v[204:205], v[160:163] \n" _UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[76:77], v[204:205], [%[c32], %[c33], %[c34], %[c35]] \n"
_UK_MFMA_ " v[160:163], acc[78:79], v[206:207], v[160:163] \n" _UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[78:79], v[206:207], [%[c32], %[c33], %[c34], %[c35]] \n"
_UK_MFMA_ " v[164:167], acc[64:65], v[208:209], v[164:167] \n" _UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[64:65], v[208:209], [%[c36], %[c37], %[c38], %[c39]] \n"
_UK_MFMA_ " v[164:167], acc[66:67], v[210:211], v[164:167] \n" _UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[66:67], v[210:211], [%[c36], %[c37], %[c38], %[c39]] \n"
" buffer_load_dwordx4 acc[200:203], %[v_os_b4], s[24:27], 0 offen offset:2048 \n" " buffer_load_dwordx4 acc[200:203], %[v_os_b4], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " v[164:167], acc[68:69], v[212:213], v[164:167] \n" _UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[68:69], v[212:213], [%[c36], %[c37], %[c38], %[c39]] \n"
_UK_MFMA_ " v[164:167], acc[70:71], v[214:215], v[164:167] \n" _UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[70:71], v[214:215], [%[c36], %[c37], %[c38], %[c39]] \n"
_UK_MFMA_ " v[164:167], acc[72:73], v[216:217], v[164:167] \n" _UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[72:73], v[216:217], [%[c36], %[c37], %[c38], %[c39]] \n"
_UK_MFMA_ " v[164:167], acc[74:75], v[218:219], v[164:167] \n" _UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[74:75], v[218:219], [%[c36], %[c37], %[c38], %[c39]] \n"
" buffer_load_dwordx4 acc[204:207], %[v_os_b4], s[24:27], 0 offen offset:3072 \n" " buffer_load_dwordx4 acc[204:207], %[v_os_b4], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " v[164:167], acc[76:77], v[220:221], v[164:167] \n" _UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[76:77], v[220:221], [%[c36], %[c37], %[c38], %[c39]] \n"
_UK_MFMA_ " v[164:167], acc[78:79], v[222:223], v[164:167] \n" _UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[78:79], v[222:223], [%[c36], %[c37], %[c38], %[c39]] \n"
_UK_MFMA_ " v[168:171], acc[80:81], v[192:193], v[168:171] \n" _UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[80:81], v[192:193], [%[c40], %[c41], %[c42], %[c43]] \n"
_UK_MFMA_ " v[168:171], acc[82:83], v[194:195], v[168:171] \n" _UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[82:83], v[194:195], [%[c40], %[c41], %[c42], %[c43]] \n"
" buffer_load_dwordx4 acc[208:211], %[v_os_b5], s[24:27], 0 offen \n" " buffer_load_dwordx4 acc[208:211], %[v_os_b5], s[24:27], 0 offen \n"
_UK_MFMA_ " v[168:171], acc[84:85], v[196:197], v[168:171] \n" _UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[84:85], v[196:197], [%[c40], %[c41], %[c42], %[c43]] \n"
_UK_MFMA_ " v[168:171], acc[86:87], v[198:199], v[168:171] \n" _UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[86:87], v[198:199], [%[c40], %[c41], %[c42], %[c43]] \n"
_UK_MFMA_ " v[168:171], acc[88:89], v[200:201], v[168:171] \n" _UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[88:89], v[200:201], [%[c40], %[c41], %[c42], %[c43]] \n"
_UK_MFMA_ " v[168:171], acc[90:91], v[202:203], v[168:171] \n" _UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[90:91], v[202:203], [%[c40], %[c41], %[c42], %[c43]] \n"
" buffer_load_dwordx4 acc[212:215], %[v_os_b5], s[24:27], 0 offen offset:1024 \n" " buffer_load_dwordx4 acc[212:215], %[v_os_b5], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " v[168:171], acc[92:93], v[204:205], v[168:171] \n" _UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[92:93], v[204:205], [%[c40], %[c41], %[c42], %[c43]] \n"
_UK_MFMA_ " v[168:171], acc[94:95], v[206:207], v[168:171] \n" _UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[94:95], v[206:207], [%[c40], %[c41], %[c42], %[c43]] \n"
_UK_MFMA_ " v[172:175], acc[80:81], v[208:209], v[172:175] \n" _UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[80:81], v[208:209], [%[c44], %[c45], %[c46], %[c47]] \n"
_UK_MFMA_ " v[172:175], acc[82:83], v[210:211], v[172:175] \n" _UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[82:83], v[210:211], [%[c44], %[c45], %[c46], %[c47]] \n"
" buffer_load_dwordx4 acc[216:219], %[v_os_b5], s[24:27], 0 offen offset:2048 \n" " buffer_load_dwordx4 acc[216:219], %[v_os_b5], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " v[172:175], acc[84:85], v[212:213], v[172:175] \n" _UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[84:85], v[212:213], [%[c44], %[c45], %[c46], %[c47]] \n"
_UK_MFMA_ " v[172:175], acc[86:87], v[214:215], v[172:175] \n" _UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[86:87], v[214:215], [%[c44], %[c45], %[c46], %[c47]] \n"
_UK_MFMA_ " v[172:175], acc[88:89], v[216:217], v[172:175] \n" _UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[88:89], v[216:217], [%[c44], %[c45], %[c46], %[c47]] \n"
_UK_MFMA_ " v[172:175], acc[90:91], v[218:219], v[172:175] \n" _UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[90:91], v[218:219], [%[c44], %[c45], %[c46], %[c47]] \n"
" buffer_load_dwordx4 acc[220:223], %[v_os_b5], s[24:27], 0 offen offset:3072 \n" " buffer_load_dwordx4 acc[220:223], %[v_os_b5], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " v[172:175], acc[92:93], v[220:221], v[172:175] \n" _UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[92:93], v[220:221], [%[c44], %[c45], %[c46], %[c47]] \n"
_UK_MFMA_ " v[172:175], acc[94:95], v[222:223], v[172:175] \n" _UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[94:95], v[222:223], [%[c44], %[c45], %[c46], %[c47]] \n"
" s_waitcnt vmcnt(32) \n" " s_waitcnt vmcnt(32) \n"
_UK_MFMA_ " v[176:179], acc[96:97], v[192:193], v[176:179] \n" _UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[96:97], v[192:193], [%[c48], %[c49], %[c50], %[c51]] \n"
_UK_MFMA_ " v[176:179], acc[98:99], v[194:195], v[176:179] \n" _UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[98:99], v[194:195], [%[c48], %[c49], %[c50], %[c51]] \n"
" buffer_load_dwordx4 acc[224:227], %[v_os_b6], s[24:27], 0 offen \n" " buffer_load_dwordx4 acc[224:227], %[v_os_b6], s[24:27], 0 offen \n"
_UK_MFMA_ " v[176:179], acc[100:101], v[196:197], v[176:179] \n" _UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[100:101], v[196:197], [%[c48], %[c49], %[c50], %[c51]] \n"
_UK_MFMA_ " v[176:179], acc[102:103], v[198:199], v[176:179] \n" _UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[102:103], v[198:199], [%[c48], %[c49], %[c50], %[c51]] \n"
_UK_MFMA_ " v[176:179], acc[104:105], v[200:201], v[176:179] \n" _UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[104:105], v[200:201], [%[c48], %[c49], %[c50], %[c51]] \n"
_UK_MFMA_ " v[176:179], acc[106:107], v[202:203], v[176:179] \n" _UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[106:107], v[202:203], [%[c48], %[c49], %[c50], %[c51]] \n"
" buffer_load_dwordx4 acc[228:231], %[v_os_b6], s[24:27], 0 offen offset:1024 \n" " buffer_load_dwordx4 acc[228:231], %[v_os_b6], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " v[176:179], acc[108:109], v[204:205], v[176:179] \n" _UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[108:109], v[204:205], [%[c48], %[c49], %[c50], %[c51]] \n"
_UK_MFMA_ " v[176:179], acc[110:111], v[206:207], v[176:179] \n" _UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[110:111], v[206:207], [%[c48], %[c49], %[c50], %[c51]] \n"
_UK_MFMA_ " v[180:183], acc[96:97], v[208:209], v[180:183] \n" _UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[96:97], v[208:209], [%[c52], %[c53], %[c54], %[c55]] \n"
_UK_MFMA_ " v[180:183], acc[98:99], v[210:211], v[180:183] \n" _UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[98:99], v[210:211], [%[c52], %[c53], %[c54], %[c55]] \n"
" buffer_load_dwordx4 acc[232:235], %[v_os_b6], s[24:27], 0 offen offset:2048 \n" " buffer_load_dwordx4 acc[232:235], %[v_os_b6], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " v[180:183], acc[100:101], v[212:213], v[180:183] \n" _UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[100:101], v[212:213], [%[c52], %[c53], %[c54], %[c55]] \n"
_UK_MFMA_ " v[180:183], acc[102:103], v[214:215], v[180:183] \n" _UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[102:103], v[214:215], [%[c52], %[c53], %[c54], %[c55]] \n"
_UK_MFMA_ " v[180:183], acc[104:105], v[216:217], v[180:183] \n" _UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[104:105], v[216:217], [%[c52], %[c53], %[c54], %[c55]] \n"
_UK_MFMA_ " v[180:183], acc[106:107], v[218:219], v[180:183] \n" _UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[106:107], v[218:219], [%[c52], %[c53], %[c54], %[c55]] \n"
" buffer_load_dwordx4 acc[236:239], %[v_os_b6], s[24:27], 0 offen offset:3072 \n" " buffer_load_dwordx4 acc[236:239], %[v_os_b6], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " v[180:183], acc[108:109], v[220:221], v[180:183] \n" _UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[108:109], v[220:221], [%[c52], %[c53], %[c54], %[c55]] \n"
_UK_MFMA_ " v[180:183], acc[110:111], v[222:223], v[180:183] \n" _UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[110:111], v[222:223], [%[c52], %[c53], %[c54], %[c55]] \n"
_UK_MFMA_ " v[184:187], acc[112:113], v[192:193], v[184:187] \n" _UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[112:113], v[192:193], [%[c56], %[c57], %[c58], %[c59]] \n"
_UK_MFMA_ " v[184:187], acc[114:115], v[194:195], v[184:187] \n" _UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[114:115], v[194:195], [%[c56], %[c57], %[c58], %[c59]] \n"
" buffer_load_dwordx4 acc[240:243], %[v_os_b7], s[24:27], 0 offen \n" " buffer_load_dwordx4 acc[240:243], %[v_os_b7], s[24:27], 0 offen \n"
_UK_MFMA_ " v[184:187], acc[116:117], v[196:197], v[184:187] \n" _UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[116:117], v[196:197], [%[c56], %[c57], %[c58], %[c59]] \n"
_UK_MFMA_ " v[184:187], acc[118:119], v[198:199], v[184:187] \n" _UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[118:119], v[198:199], [%[c56], %[c57], %[c58], %[c59]] \n"
_UK_MFMA_ " v[184:187], acc[120:121], v[200:201], v[184:187] \n" _UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[120:121], v[200:201], [%[c56], %[c57], %[c58], %[c59]] \n"
_UK_MFMA_ " v[184:187], acc[122:123], v[202:203], v[184:187] \n" _UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[122:123], v[202:203], [%[c56], %[c57], %[c58], %[c59]] \n"
" buffer_load_dwordx4 acc[244:247], %[v_os_b7], s[24:27], 0 offen offset:1024 \n" " buffer_load_dwordx4 acc[244:247], %[v_os_b7], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " v[184:187], acc[124:125], v[204:205], v[184:187] \n" _UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[124:125], v[204:205], [%[c56], %[c57], %[c58], %[c59]] \n"
_UK_MFMA_ " v[184:187], acc[126:127], v[206:207], v[184:187] \n" _UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[126:127], v[206:207], [%[c56], %[c57], %[c58], %[c59]] \n"
_UK_MFMA_ " v[188:191], acc[112:113], v[208:209], v[188:191] \n" _UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[112:113], v[208:209], [%[c60], %[c61], %[c62], %[c63]] \n"
_UK_MFMA_ " v[188:191], acc[114:115], v[210:211], v[188:191] \n" _UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[114:115], v[210:211], [%[c60], %[c61], %[c62], %[c63]] \n"
" buffer_load_dwordx4 acc[248:251], %[v_os_b7], s[24:27], 0 offen offset:2048 \n" " buffer_load_dwordx4 acc[248:251], %[v_os_b7], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " v[188:191], acc[116:117], v[212:213], v[188:191] \n" _UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[116:117], v[212:213], [%[c60], %[c61], %[c62], %[c63]] \n"
_UK_MFMA_ " v[188:191], acc[118:119], v[214:215], v[188:191] \n" _UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[118:119], v[214:215], [%[c60], %[c61], %[c62], %[c63]] \n"
_UK_MFMA_ " v[188:191], acc[120:121], v[216:217], v[188:191] \n" _UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[120:121], v[216:217], [%[c60], %[c61], %[c62], %[c63]] \n"
_UK_MFMA_ " v[188:191], acc[122:123], v[218:219], v[188:191] \n" _UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[122:123], v[218:219], [%[c60], %[c61], %[c62], %[c63]] \n"
" buffer_load_dwordx4 acc[252:255], %[v_os_b7], s[24:27], 0 offen offset:3072 \n" " buffer_load_dwordx4 acc[252:255], %[v_os_b7], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " v[188:191], acc[124:125], v[220:221], v[188:191] \n" _UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[124:125], v[220:221], [%[c60], %[c61], %[c62], %[c63]] \n"
_UK_MFMA_ " v[188:191], acc[126:127], v[222:223], v[188:191] \n" _UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[126:127], v[222:223], [%[c60], %[c61], %[c62], %[c63]] \n"
" s_add_u32 s60, 0x00000300, s80 \n" " s_add_u32 s60, 0x00000300, s80 \n"
" s_cmp_lt_u32 s60, %[s_loop_cnt] \n" " s_cmp_lt_u32 s60, %[s_loop_cnt] \n"
" s_cselect_b32 s57, s57, 0 \n" " s_cselect_b32 s57, s57, 0 \n"
...@@ -391,193 +391,193 @@ ...@@ -391,193 +391,193 @@
" s_cbranch_scc0 label_end \n" " s_cbranch_scc0 label_end \n"
" s_waitcnt vmcnt(24) & lgkmcnt(0) \n" " s_waitcnt vmcnt(24) & lgkmcnt(0) \n"
" s_barrier \n" " s_barrier \n"
_UK_MFMA_ " v[128:131], acc[128:129], v[224:225], v[128:131] \n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[128:129], v[224:225], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " v[128:131], acc[130:131], v[226:227], v[128:131] \n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[130:131], v[226:227], [%[c0], %[c1], %[c2], %[c3]] \n"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[24:27], 0 offen \n" " buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[24:27], 0 offen \n"
_UK_MFMA_ " v[128:131], acc[132:133], v[228:229], v[128:131] \n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[132:133], v[228:229], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " v[128:131], acc[134:135], v[230:231], v[128:131] \n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[134:135], v[230:231], [%[c0], %[c1], %[c2], %[c3]] \n"
" buffer_load_dword s[20:23], 0 offen lds \n" " buffer_load_dword s[20:23], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n" " s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " v[128:131], acc[136:137], v[232:233], v[128:131] \n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[136:137], v[232:233], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " v[128:131], acc[138:139], v[234:235], v[128:131] \n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[138:139], v[234:235], [%[c0], %[c1], %[c2], %[c3]] \n"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[24:27], 0 offen offset:1024 \n" " buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " v[128:131], acc[140:141], v[236:237], v[128:131] \n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[140:141], v[236:237], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " v[128:131], acc[142:143], v[238:239], v[128:131] \n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[142:143], v[238:239], [%[c0], %[c1], %[c2], %[c3]] \n"
" buffer_load_dword s[20:23], 0 offen lds \n" " buffer_load_dword s[20:23], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n" " s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " v[132:135], acc[128:129], v[240:241], v[132:135] \n" _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[128:129], v[240:241], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " v[132:135], acc[130:131], v[242:243], v[132:135] \n" _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[130:131], v[242:243], [%[c4], %[c5], %[c6], %[c7]] \n"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[24:27], 0 offen offset:2048 \n" " buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " v[132:135], acc[132:133], v[244:245], v[132:135] \n" _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[132:133], v[244:245], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " v[132:135], acc[134:135], v[246:247], v[132:135] \n" _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[134:135], v[246:247], [%[c4], %[c5], %[c6], %[c7]] \n"
" buffer_load_dword s[20:23], 0 offen lds \n" " buffer_load_dword s[20:23], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n" " s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " v[132:135], acc[136:137], v[248:249], v[132:135] \n" _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[136:137], v[248:249], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " v[132:135], acc[138:139], v[250:251], v[132:135] \n" _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[138:139], v[250:251], [%[c4], %[c5], %[c6], %[c7]] \n"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[24:27], 0 offen offset:3072 \n" " buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " v[132:135], acc[140:141], v[252:253], v[132:135] \n" _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[140:141], v[252:253], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " v[132:135], acc[142:143], v[254:255], v[132:135] \n" _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[142:143], v[254:255], [%[c4], %[c5], %[c6], %[c7]] \n"
" buffer_load_dword s[20:23], 0 offen lds \n" " buffer_load_dword s[20:23], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n" " s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " v[136:139], acc[144:145], v[224:225], v[136:139] \n" _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[144:145], v[224:225], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " v[136:139], acc[146:147], v[226:227], v[136:139] \n" _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[146:147], v[226:227], [%[c8], %[c9], %[c10], %[c11]] \n"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[24:27], 0 offen \n" " buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[24:27], 0 offen \n"
_UK_MFMA_ " v[136:139], acc[148:149], v[228:229], v[136:139] \n" _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[148:149], v[228:229], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " v[136:139], acc[150:151], v[230:231], v[136:139] \n" _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[150:151], v[230:231], [%[c8], %[c9], %[c10], %[c11]] \n"
" buffer_load_dword s[20:23], 0 offen lds \n" " buffer_load_dword s[20:23], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n" " s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " v[136:139], acc[152:153], v[232:233], v[136:139] \n" _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[152:153], v[232:233], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " v[136:139], acc[154:155], v[234:235], v[136:139] \n" _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[154:155], v[234:235], [%[c8], %[c9], %[c10], %[c11]] \n"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[24:27], 0 offen offset:1024 \n" " buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " v[136:139], acc[156:157], v[236:237], v[136:139] \n" _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[156:157], v[236:237], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " v[136:139], acc[158:159], v[238:239], v[136:139] \n" _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[158:159], v[238:239], [%[c8], %[c9], %[c10], %[c11]] \n"
" buffer_load_dword s[20:23], 0 offen lds \n" " buffer_load_dword s[20:23], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n" " s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " v[140:143], acc[144:145], v[240:241], v[140:143] \n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[144:145], v[240:241], [%[c12], %[c13], %[c14], %[c15]] \n"
_UK_MFMA_ " v[140:143], acc[146:147], v[242:243], v[140:143] \n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[146:147], v[242:243], [%[c12], %[c13], %[c14], %[c15]] \n"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[24:27], 0 offen offset:2048 \n" " buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " v[140:143], acc[148:149], v[244:245], v[140:143] \n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[148:149], v[244:245], [%[c12], %[c13], %[c14], %[c15]] \n"
_UK_MFMA_ " v[140:143], acc[150:151], v[246:247], v[140:143] \n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[150:151], v[246:247], [%[c12], %[c13], %[c14], %[c15]] \n"
" buffer_load_dword s[20:23], 0 offen lds \n" " buffer_load_dword s[20:23], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n" " s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " v[140:143], acc[152:153], v[248:249], v[140:143] \n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[152:153], v[248:249], [%[c12], %[c13], %[c14], %[c15]] \n"
_UK_MFMA_ " v[140:143], acc[154:155], v[250:251], v[140:143] \n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[154:155], v[250:251], [%[c12], %[c13], %[c14], %[c15]] \n"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[24:27], 0 offen offset:3072 \n" " buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " v[140:143], acc[156:157], v[252:253], v[140:143] \n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[156:157], v[252:253], [%[c12], %[c13], %[c14], %[c15]] \n"
_UK_MFMA_ " v[140:143], acc[158:159], v[254:255], v[140:143] \n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[158:159], v[254:255], [%[c12], %[c13], %[c14], %[c15]] \n"
" buffer_load_dword s[20:23], 0 offen lds \n" " buffer_load_dword s[20:23], 0 offen lds \n"
" s_add_u32 m0, 0, %[s_m0_init] \n" " s_add_u32 m0, 0, %[s_m0_init] \n"
" s_waitcnt vmcnt(32) \n" " s_waitcnt vmcnt(32) \n"
_UK_MFMA_ " v[144:147], acc[160:161], v[224:225], v[144:147] \n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[160:161], v[224:225], [%[c16], %[c17], %[c18], %[c19]] \n"
_UK_MFMA_ " v[144:147], acc[162:163], v[226:227], v[144:147] \n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[162:163], v[226:227], [%[c16], %[c17], %[c18], %[c19]] \n"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[24:27], 0 offen \n" " buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[24:27], 0 offen \n"
_UK_MFMA_ " v[144:147], acc[164:165], v[228:229], v[144:147] \n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[164:165], v[228:229], [%[c16], %[c17], %[c18], %[c19]] \n"
_UK_MFMA_ " v[144:147], acc[166:167], v[230:231], v[144:147] \n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[166:167], v[230:231], [%[c16], %[c17], %[c18], %[c19]] \n"
" ds_read_b128 v[192:195], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_0] \n" " ds_read_b128 v[192:195], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_0] \n"
_UK_MFMA_ " v[144:147], acc[168:169], v[232:233], v[144:147] \n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[168:169], v[232:233], [%[c16], %[c17], %[c18], %[c19]] \n"
_UK_MFMA_ " v[144:147], acc[170:171], v[234:235], v[144:147] \n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[170:171], v[234:235], [%[c16], %[c17], %[c18], %[c19]] \n"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[24:27], 0 offen offset:1024 \n" " buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " v[144:147], acc[172:173], v[236:237], v[144:147] \n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[172:173], v[236:237], [%[c16], %[c17], %[c18], %[c19]] \n"
_UK_MFMA_ " v[144:147], acc[174:175], v[238:239], v[144:147] \n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[174:175], v[238:239], [%[c16], %[c17], %[c18], %[c19]] \n"
" ds_read_b128 v[196:199], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_1] \n" " ds_read_b128 v[196:199], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_1] \n"
_UK_MFMA_ " v[148:151], acc[160:161], v[240:241], v[148:151] \n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[160:161], v[240:241], [%[c20], %[c21], %[c22], %[c23]] \n"
_UK_MFMA_ " v[148:151], acc[162:163], v[242:243], v[148:151] \n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[162:163], v[242:243], [%[c20], %[c21], %[c22], %[c23]] \n"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[24:27], 0 offen offset:2048 \n" " buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " v[148:151], acc[164:165], v[244:245], v[148:151] \n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[164:165], v[244:245], [%[c20], %[c21], %[c22], %[c23]] \n"
_UK_MFMA_ " v[148:151], acc[166:167], v[246:247], v[148:151] \n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[166:167], v[246:247], [%[c20], %[c21], %[c22], %[c23]] \n"
" ds_read_b128 v[200:203], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_2] \n" " ds_read_b128 v[200:203], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_2] \n"
_UK_MFMA_ " v[148:151], acc[168:169], v[248:249], v[148:151] \n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[168:169], v[248:249], [%[c20], %[c21], %[c22], %[c23]] \n"
_UK_MFMA_ " v[148:151], acc[170:171], v[250:251], v[148:151] \n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[170:171], v[250:251], [%[c20], %[c21], %[c22], %[c23]] \n"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[24:27], 0 offen offset:3072 \n" " buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " v[148:151], acc[172:173], v[252:253], v[148:151] \n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[172:173], v[252:253], [%[c20], %[c21], %[c22], %[c23]] \n"
_UK_MFMA_ " v[148:151], acc[174:175], v[254:255], v[148:151] \n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[174:175], v[254:255], [%[c20], %[c21], %[c22], %[c23]] \n"
" ds_read_b128 v[204:207], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_3] \n" " ds_read_b128 v[204:207], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_3] \n"
_UK_MFMA_ " v[152:155], acc[176:177], v[224:225], v[152:155] \n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[176:177], v[224:225], [%[c24], %[c25], %[c26], %[c27]] \n"
_UK_MFMA_ " v[152:155], acc[178:179], v[226:227], v[152:155] \n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[178:179], v[226:227], [%[c24], %[c25], %[c26], %[c27]] \n"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[24:27], 0 offen \n" " buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[24:27], 0 offen \n"
_UK_MFMA_ " v[152:155], acc[180:181], v[228:229], v[152:155] \n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[180:181], v[228:229], [%[c24], %[c25], %[c26], %[c27]] \n"
_UK_MFMA_ " v[152:155], acc[182:183], v[230:231], v[152:155] \n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[182:183], v[230:231], [%[c24], %[c25], %[c26], %[c27]] \n"
" ds_read_b128 v[208:211], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_4] \n" " ds_read_b128 v[208:211], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_4] \n"
_UK_MFMA_ " v[152:155], acc[184:185], v[232:233], v[152:155] \n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[184:185], v[232:233], [%[c24], %[c25], %[c26], %[c27]] \n"
_UK_MFMA_ " v[152:155], acc[186:187], v[234:235], v[152:155] \n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[186:187], v[234:235], [%[c24], %[c25], %[c26], %[c27]] \n"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[24:27], 0 offen offset:1024 \n" " buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " v[152:155], acc[188:189], v[236:237], v[152:155] \n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[188:189], v[236:237], [%[c24], %[c25], %[c26], %[c27]] \n"
_UK_MFMA_ " v[152:155], acc[190:191], v[238:239], v[152:155] \n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[190:191], v[238:239], [%[c24], %[c25], %[c26], %[c27]] \n"
" ds_read_b128 v[212:215], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_5] \n" " ds_read_b128 v[212:215], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_5] \n"
_UK_MFMA_ " v[156:159], acc[176:177], v[240:241], v[156:159] \n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[176:177], v[240:241], [%[c28], %[c29], %[c30], %[c31]] \n"
_UK_MFMA_ " v[156:159], acc[178:179], v[242:243], v[156:159] \n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[178:179], v[242:243], [%[c28], %[c29], %[c30], %[c31]] \n"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[24:27], 0 offen offset:2048 \n" " buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " v[156:159], acc[180:181], v[244:245], v[156:159] \n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[180:181], v[244:245], [%[c28], %[c29], %[c30], %[c31]] \n"
_UK_MFMA_ " v[156:159], acc[182:183], v[246:247], v[156:159] \n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[182:183], v[246:247], [%[c28], %[c29], %[c30], %[c31]] \n"
" ds_read_b128 v[216:219], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_6] \n" " ds_read_b128 v[216:219], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_6] \n"
_UK_MFMA_ " v[156:159], acc[184:185], v[248:249], v[156:159] \n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[184:185], v[248:249], [%[c28], %[c29], %[c30], %[c31]] \n"
_UK_MFMA_ " v[156:159], acc[186:187], v[250:251], v[156:159] \n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[186:187], v[250:251], [%[c28], %[c29], %[c30], %[c31]] \n"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[24:27], 0 offen offset:3072 \n" " buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " v[156:159], acc[188:189], v[252:253], v[156:159] \n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[188:189], v[252:253], [%[c28], %[c29], %[c30], %[c31]] \n"
_UK_MFMA_ " v[156:159], acc[190:191], v[254:255], v[156:159] \n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[190:191], v[254:255], [%[c28], %[c29], %[c30], %[c31]] \n"
" ds_read_b128 v[220:223], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_7] \n" " ds_read_b128 v[220:223], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_7] \n"
" s_waitcnt vmcnt(32) \n" " s_waitcnt vmcnt(32) \n"
_UK_MFMA_ " v[160:163], acc[192:193], v[224:225], v[160:163] \n" _UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[192:193], v[224:225], [%[c32], %[c33], %[c34], %[c35]] \n"
_UK_MFMA_ " v[160:163], acc[194:195], v[226:227], v[160:163] \n" _UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[194:195], v[226:227], [%[c32], %[c33], %[c34], %[c35]] \n"
" buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[24:27], 0 offen \n" " buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[24:27], 0 offen \n"
_UK_MFMA_ " v[160:163], acc[196:197], v[228:229], v[160:163] \n" _UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[196:197], v[228:229], [%[c32], %[c33], %[c34], %[c35]] \n"
_UK_MFMA_ " v[160:163], acc[198:199], v[230:231], v[160:163] \n" _UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[198:199], v[230:231], [%[c32], %[c33], %[c34], %[c35]] \n"
_UK_MFMA_ " v[160:163], acc[200:201], v[232:233], v[160:163] \n" _UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[200:201], v[232:233], [%[c32], %[c33], %[c34], %[c35]] \n"
_UK_MFMA_ " v[160:163], acc[202:203], v[234:235], v[160:163] \n" _UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[202:203], v[234:235], [%[c32], %[c33], %[c34], %[c35]] \n"
" buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[24:27], 0 offen offset:1024 \n" " buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " v[160:163], acc[204:205], v[236:237], v[160:163] \n" _UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[204:205], v[236:237], [%[c32], %[c33], %[c34], %[c35]] \n"
_UK_MFMA_ " v[160:163], acc[206:207], v[238:239], v[160:163] \n" _UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[206:207], v[238:239], [%[c32], %[c33], %[c34], %[c35]] \n"
_UK_MFMA_ " v[164:167], acc[192:193], v[240:241], v[164:167] \n" _UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[192:193], v[240:241], [%[c36], %[c37], %[c38], %[c39]] \n"
_UK_MFMA_ " v[164:167], acc[194:195], v[242:243], v[164:167] \n" _UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[194:195], v[242:243], [%[c36], %[c37], %[c38], %[c39]] \n"
" buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[24:27], 0 offen offset:2048 \n" " buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " v[164:167], acc[196:197], v[244:245], v[164:167] \n" _UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[196:197], v[244:245], [%[c36], %[c37], %[c38], %[c39]] \n"
_UK_MFMA_ " v[164:167], acc[198:199], v[246:247], v[164:167] \n" _UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[198:199], v[246:247], [%[c36], %[c37], %[c38], %[c39]] \n"
_UK_MFMA_ " v[164:167], acc[200:201], v[248:249], v[164:167] \n" _UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[200:201], v[248:249], [%[c36], %[c37], %[c38], %[c39]] \n"
_UK_MFMA_ " v[164:167], acc[202:203], v[250:251], v[164:167] \n" _UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[202:203], v[250:251], [%[c36], %[c37], %[c38], %[c39]] \n"
" buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[24:27], 0 offen offset:3072 \n" " buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " v[164:167], acc[204:205], v[252:253], v[164:167] \n" _UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[204:205], v[252:253], [%[c36], %[c37], %[c38], %[c39]] \n"
_UK_MFMA_ " v[164:167], acc[206:207], v[254:255], v[164:167] \n" _UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[206:207], v[254:255], [%[c36], %[c37], %[c38], %[c39]] \n"
_UK_MFMA_ " v[168:171], acc[208:209], v[224:225], v[168:171] \n" _UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[208:209], v[224:225], [%[c40], %[c41], %[c42], %[c43]] \n"
_UK_MFMA_ " v[168:171], acc[210:211], v[226:227], v[168:171] \n" _UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[210:211], v[226:227], [%[c40], %[c41], %[c42], %[c43]] \n"
" buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[24:27], 0 offen \n" " buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[24:27], 0 offen \n"
_UK_MFMA_ " v[168:171], acc[212:213], v[228:229], v[168:171] \n" _UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[212:213], v[228:229], [%[c40], %[c41], %[c42], %[c43]] \n"
_UK_MFMA_ " v[168:171], acc[214:215], v[230:231], v[168:171] \n" _UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[214:215], v[230:231], [%[c40], %[c41], %[c42], %[c43]] \n"
_UK_MFMA_ " v[168:171], acc[216:217], v[232:233], v[168:171] \n" _UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[216:217], v[232:233], [%[c40], %[c41], %[c42], %[c43]] \n"
_UK_MFMA_ " v[168:171], acc[218:219], v[234:235], v[168:171] \n" _UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[218:219], v[234:235], [%[c40], %[c41], %[c42], %[c43]] \n"
" buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[24:27], 0 offen offset:1024 \n" " buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " v[168:171], acc[220:221], v[236:237], v[168:171] \n" _UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[220:221], v[236:237], [%[c40], %[c41], %[c42], %[c43]] \n"
_UK_MFMA_ " v[168:171], acc[222:223], v[238:239], v[168:171] \n" _UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[222:223], v[238:239], [%[c40], %[c41], %[c42], %[c43]] \n"
_UK_MFMA_ " v[172:175], acc[208:209], v[240:241], v[172:175] \n" _UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[208:209], v[240:241], [%[c44], %[c45], %[c46], %[c47]] \n"
_UK_MFMA_ " v[172:175], acc[210:211], v[242:243], v[172:175] \n" _UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[210:211], v[242:243], [%[c44], %[c45], %[c46], %[c47]] \n"
" buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[24:27], 0 offen offset:2048 \n" " buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " v[172:175], acc[212:213], v[244:245], v[172:175] \n" _UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[212:213], v[244:245], [%[c44], %[c45], %[c46], %[c47]] \n"
_UK_MFMA_ " v[172:175], acc[214:215], v[246:247], v[172:175] \n" _UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[214:215], v[246:247], [%[c44], %[c45], %[c46], %[c47]] \n"
_UK_MFMA_ " v[172:175], acc[216:217], v[248:249], v[172:175] \n" _UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[216:217], v[248:249], [%[c44], %[c45], %[c46], %[c47]] \n"
_UK_MFMA_ " v[172:175], acc[218:219], v[250:251], v[172:175] \n" _UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[218:219], v[250:251], [%[c44], %[c45], %[c46], %[c47]] \n"
" buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[24:27], 0 offen offset:3072 \n" " buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " v[172:175], acc[220:221], v[252:253], v[172:175] \n" _UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[220:221], v[252:253], [%[c44], %[c45], %[c46], %[c47]] \n"
_UK_MFMA_ " v[172:175], acc[222:223], v[254:255], v[172:175] \n" _UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[222:223], v[254:255], [%[c44], %[c45], %[c46], %[c47]] \n"
" s_waitcnt vmcnt(32) \n" " s_waitcnt vmcnt(32) \n"
_UK_MFMA_ " v[176:179], acc[224:225], v[224:225], v[176:179] \n" _UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[224:225], v[224:225], [%[c48], %[c49], %[c50], %[c51]] \n"
_UK_MFMA_ " v[176:179], acc[226:227], v[226:227], v[176:179] \n" _UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[226:227], v[226:227], [%[c48], %[c49], %[c50], %[c51]] \n"
" buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[24:27], 0 offen \n" " buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[24:27], 0 offen \n"
_UK_MFMA_ " v[176:179], acc[228:229], v[228:229], v[176:179] \n" _UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[228:229], v[228:229], [%[c48], %[c49], %[c50], %[c51]] \n"
_UK_MFMA_ " v[176:179], acc[230:231], v[230:231], v[176:179] \n" _UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[230:231], v[230:231], [%[c48], %[c49], %[c50], %[c51]] \n"
_UK_MFMA_ " v[176:179], acc[232:233], v[232:233], v[176:179] \n" _UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[232:233], v[232:233], [%[c48], %[c49], %[c50], %[c51]] \n"
_UK_MFMA_ " v[176:179], acc[234:235], v[234:235], v[176:179] \n" _UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[234:235], v[234:235], [%[c48], %[c49], %[c50], %[c51]] \n"
" buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[24:27], 0 offen offset:1024 \n" " buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " v[176:179], acc[236:237], v[236:237], v[176:179] \n" _UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[236:237], v[236:237], [%[c48], %[c49], %[c50], %[c51]] \n"
_UK_MFMA_ " v[176:179], acc[238:239], v[238:239], v[176:179] \n" _UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[238:239], v[238:239], [%[c48], %[c49], %[c50], %[c51]] \n"
_UK_MFMA_ " v[180:183], acc[224:225], v[240:241], v[180:183] \n" _UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[224:225], v[240:241], [%[c52], %[c53], %[c54], %[c55]] \n"
_UK_MFMA_ " v[180:183], acc[226:227], v[242:243], v[180:183] \n" _UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[226:227], v[242:243], [%[c52], %[c53], %[c54], %[c55]] \n"
" buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[24:27], 0 offen offset:2048 \n" " buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " v[180:183], acc[228:229], v[244:245], v[180:183] \n" _UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[228:229], v[244:245], [%[c52], %[c53], %[c54], %[c55]] \n"
_UK_MFMA_ " v[180:183], acc[230:231], v[246:247], v[180:183] \n" _UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[230:231], v[246:247], [%[c52], %[c53], %[c54], %[c55]] \n"
_UK_MFMA_ " v[180:183], acc[232:233], v[248:249], v[180:183] \n" _UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[232:233], v[248:249], [%[c52], %[c53], %[c54], %[c55]] \n"
_UK_MFMA_ " v[180:183], acc[234:235], v[250:251], v[180:183] \n" _UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[234:235], v[250:251], [%[c52], %[c53], %[c54], %[c55]] \n"
" buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[24:27], 0 offen offset:3072 \n" " buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " v[180:183], acc[236:237], v[252:253], v[180:183] \n" _UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[236:237], v[252:253], [%[c52], %[c53], %[c54], %[c55]] \n"
_UK_MFMA_ " v[180:183], acc[238:239], v[254:255], v[180:183] \n" _UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[238:239], v[254:255], [%[c52], %[c53], %[c54], %[c55]] \n"
_UK_MFMA_ " v[184:187], acc[240:241], v[224:225], v[184:187] \n" _UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[240:241], v[224:225], [%[c56], %[c57], %[c58], %[c59]] \n"
_UK_MFMA_ " v[184:187], acc[242:243], v[226:227], v[184:187] \n" _UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[242:243], v[226:227], [%[c56], %[c57], %[c58], %[c59]] \n"
" buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[24:27], 0 offen \n" " buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[24:27], 0 offen \n"
_UK_MFMA_ " v[184:187], acc[244:245], v[228:229], v[184:187] \n" _UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[244:245], v[228:229], [%[c56], %[c57], %[c58], %[c59]] \n"
_UK_MFMA_ " v[184:187], acc[246:247], v[230:231], v[184:187] \n" _UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[246:247], v[230:231], [%[c56], %[c57], %[c58], %[c59]] \n"
_UK_MFMA_ " v[184:187], acc[248:249], v[232:233], v[184:187] \n" _UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[248:249], v[232:233], [%[c56], %[c57], %[c58], %[c59]] \n"
_UK_MFMA_ " v[184:187], acc[250:251], v[234:235], v[184:187] \n" _UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[250:251], v[234:235], [%[c56], %[c57], %[c58], %[c59]] \n"
" buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[24:27], 0 offen offset:1024 \n" " buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " v[184:187], acc[252:253], v[236:237], v[184:187] \n" _UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[252:253], v[236:237], [%[c56], %[c57], %[c58], %[c59]] \n"
_UK_MFMA_ " v[184:187], acc[254:255], v[238:239], v[184:187] \n" _UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[254:255], v[238:239], [%[c56], %[c57], %[c58], %[c59]] \n"
_UK_MFMA_ " v[188:191], acc[240:241], v[240:241], v[188:191] \n" _UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[240:241], v[240:241], [%[c60], %[c61], %[c62], %[c63]] \n"
_UK_MFMA_ " v[188:191], acc[242:243], v[242:243], v[188:191] \n" _UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[242:243], v[242:243], [%[c60], %[c61], %[c62], %[c63]] \n"
" buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[24:27], 0 offen offset:2048 \n" " buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " v[188:191], acc[244:245], v[244:245], v[188:191] \n" _UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[244:245], v[244:245], [%[c60], %[c61], %[c62], %[c63]] \n"
_UK_MFMA_ " v[188:191], acc[246:247], v[246:247], v[188:191] \n" _UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[246:247], v[246:247], [%[c60], %[c61], %[c62], %[c63]] \n"
_UK_MFMA_ " v[188:191], acc[248:249], v[248:249], v[188:191] \n" _UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[248:249], v[248:249], [%[c60], %[c61], %[c62], %[c63]] \n"
_UK_MFMA_ " v[188:191], acc[250:251], v[250:251], v[188:191] \n" _UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[250:251], v[250:251], [%[c60], %[c61], %[c62], %[c63]] \n"
" buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[24:27], 0 offen offset:3072 \n" " buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " v[188:191], acc[252:253], v[252:253], v[188:191] \n" _UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[252:253], v[252:253], [%[c60], %[c61], %[c62], %[c63]] \n"
_UK_MFMA_ " v[188:191], acc[254:255], v[254:255], v[188:191] \n" _UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[254:255], v[254:255], [%[c60], %[c61], %[c62], %[c63]] \n"
" s_add_u32 s60, 0x00000300, s80 \n" " s_add_u32 s60, 0x00000300, s80 \n"
" s_cmp_lt_u32 s60, %[s_loop_cnt] \n" " s_cmp_lt_u32 s60, %[s_loop_cnt] \n"
" s_cselect_b32 s57, s57, 0 \n" " s_cselect_b32 s57, s57, 0 \n"
...@@ -790,3 +790,4 @@ ...@@ -790,3 +790,4 @@
#undef _DEQUAN_CVT_ #undef _DEQUAN_CVT_
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment