Merge branch 'develop' into ck_tile_gemmkernel_reuse

ba8b5c58 · Aleksander Dudek · 8bba35f2 · d9e37c68 · ba8b5c58 · ba8b5c58
Commit ba8b5c58 authored Dec 17, 2024 by Aleksander Dudek
15 changed files
--- a/include/ck_tile/ops/flatmm/block/uk/flatmm_uk_gfx9_32x512x128_1x1x1_16x16x16.inc
+++ b/include/ck_tile/ops/flatmm/block/uk/flatmm_uk_gfx9_32x512x128_1x1x1_16x16x16.inc
@@ -9,508 +9,509 @@
 #endif

 "s_mov_b32 s16,    %[s_res_a0] \n"
-"s_mov_b32 s17,    %[s_res_a1] \n"
-"s_mov_b32 s18,    %[s_res_a2] \n"
-"s_mov_b32 s19,    %[s_res_a3] \n"
-"s_mov_b32 s20,    %[s_res_b0] \n"
-"s_mov_b32 s21,    %[s_res_b1] \n"
-"s_mov_b32 s22,    %[s_res_b2] \n"
-"s_mov_b32 s23,    %[s_res_b3] \n"
-// "s_nop  4\n"
-"; -- prefetch A0\n"
-"s_add_u32     m0, 0, %[s_m0_init]                        \n"
-"buffer_load_dword   %[v_os_a0], s[16:19], 0 offen lds    \n"
-"s_add_u32     m0, %[s_size_per_issue], m0                \n"
-"buffer_load_dword   %[v_os_a1], s[16:19], 0 offen lds    \n"
-"s_add_u32     m0, %[s_size_per_issue], m0                \n"
-"buffer_load_dword   %[v_os_a2], s[16:19], 0 offen lds    \n"
-"s_add_u32     m0, %[s_size_per_issue], m0                \n"
-"buffer_load_dword   %[v_os_a3], s[16:19], 0 offen lds    \n"
-"s_add_u32     m0, %[s_size_per_issue], m0                \n"
-"buffer_load_dword   %[v_os_a4], s[16:19], 0 offen lds    \n"
-"s_add_u32     m0, %[s_size_per_issue], m0                \n"
-"buffer_load_dword   %[v_os_a5], s[16:19], 0 offen lds    \n"
-"s_add_u32     m0, %[s_size_per_issue], m0                \n"
-"buffer_load_dword   %[v_os_a6], s[16:19], 0 offen lds    \n"
-"s_add_u32     m0, %[s_size_per_issue], m0                \n"
-"buffer_load_dword   %[v_os_a7], s[16:19], 0 offen lds    \n"
-"s_add_u32 m0, %[smem_sz], %[s_m0_init]                       \n"
-"s_cmp_gt_i32  %[s_loop_cnt] 1             ; move a with cond \n"
-"s_cselect_b32 s86, %[s_tile_os_a], 0      ; move a with cond  \n"
-"s_add_u32     s16, s86, s16               ; move a with cond \n"
-"s_addc_u32    s17, 0, s17                 ; move a with cond \n"
-"; -- prefetch A1\n"
-"buffer_load_dword   %[v_os_a0], s[16:19], 0 offen lds    \n"
-"s_add_u32     m0, %[s_size_per_issue], m0                  \n"
-"buffer_load_dword   %[v_os_a1], s[16:19], 0 offen lds    \n"
-"s_add_u32     m0, %[s_size_per_issue], m0                  \n"
-"buffer_load_dword   %[v_os_a2], s[16:19], 0 offen lds    \n"
-"s_add_u32     m0, %[s_size_per_issue], m0                  \n"
-"buffer_load_dword   %[v_os_a3], s[16:19], 0 offen lds    \n"
-"s_add_u32     m0, %[s_size_per_issue], m0                  \n"
-"buffer_load_dword   %[v_os_a4], s[16:19], 0 offen lds    \n"
-"s_add_u32     m0, %[s_size_per_issue], m0                  \n"
-"buffer_load_dword   %[v_os_a5], s[16:19], 0 offen lds    \n"
-"s_add_u32     m0, %[s_size_per_issue], m0                  \n"
-"buffer_load_dword   %[v_os_a6], s[16:19], 0 offen lds    \n"
-"s_add_u32     m0, %[s_size_per_issue], m0                  \n"
-"buffer_load_dword   %[v_os_a7], s[16:19], 0 offen lds    \n"
-"s_add_u32 m0, 0, %[s_m0_init]                                \n"
-"s_cmp_gt_i32  %[s_loop_cnt] 2             ; move a with cond \n"
-"s_cselect_b32 s86, %[s_tile_os_a], 0      ; move a with cond  \n"
-"s_add_u32     s16, s86, s16               ; move a with cond \n"
-"s_addc_u32    s17, 0, s17                 ; move a with cond \n"
-"; -- prefetch B0\n"
-"buffer_load_dwordx4  acc[0:3], %[v_os_b0], s[20:23], 0 offen \n"
-"buffer_load_dwordx4  acc[4:7], %[v_os_b0], s[20:23], 0 offen offset:1024  \n"
-"buffer_load_dwordx4  acc[8:11], %[v_os_b0], s[20:23], 0 offen offset:2048  \n"
-"buffer_load_dwordx4  acc[12:15], %[v_os_b0], s[20:23], 0 offen offset:3072  \n"
-"buffer_load_dwordx4  acc[16:19], %[v_os_b1], s[20:23], 0 offen  \n"
-"buffer_load_dwordx4  acc[20:23], %[v_os_b1], s[20:23], 0 offen offset:1024  \n"
-"buffer_load_dwordx4  acc[24:27], %[v_os_b1], s[20:23], 0 offen offset:2048  \n"
-"buffer_load_dwordx4  acc[28:31], %[v_os_b1], s[20:23], 0 offen offset:3072  \n"
-"buffer_load_dwordx4  acc[32:35], %[v_os_b2], s[20:23], 0 offen  \n"
-"buffer_load_dwordx4  acc[36:39], %[v_os_b2], s[20:23], 0 offen offset:1024  \n"
-"buffer_load_dwordx4  acc[40:43], %[v_os_b2], s[20:23], 0 offen offset:2048  \n"
-"buffer_load_dwordx4  acc[44:47], %[v_os_b2], s[20:23], 0 offen offset:3072  \n"
-"buffer_load_dwordx4  acc[48:51], %[v_os_b3], s[20:23], 0 offen  \n"
-"buffer_load_dwordx4  acc[52:55], %[v_os_b3], s[20:23], 0 offen offset:1024    \n"
-"buffer_load_dwordx4  acc[56:59], %[v_os_b3], s[20:23], 0 offen offset:2048    \n"
-"buffer_load_dwordx4  acc[60:63], %[v_os_b3], s[20:23], 0 offen offset:3072    \n"
-"buffer_load_dwordx4  acc[64:67], %[v_os_b4], s[20:23], 0 offen                \n"
-"buffer_load_dwordx4  acc[68:71], %[v_os_b4], s[20:23], 0 offen offset:1024    \n"
-"buffer_load_dwordx4  acc[72:75], %[v_os_b4], s[20:23], 0 offen offset:2048    \n"
-"buffer_load_dwordx4  acc[76:79], %[v_os_b4], s[20:23], 0 offen offset:3072    \n"
-"buffer_load_dwordx4  acc[80:83], %[v_os_b5], s[20:23], 0 offen                \n"
-"buffer_load_dwordx4  acc[84:87], %[v_os_b5], s[20:23], 0 offen offset:1024    \n"
-"buffer_load_dwordx4  acc[88:91], %[v_os_b5], s[20:23], 0 offen offset:2048    \n"
-"buffer_load_dwordx4  acc[92:95], %[v_os_b5], s[20:23], 0 offen offset:3072    \n"
-"buffer_load_dwordx4  acc[96:99], %[v_os_b6], s[20:23], 0 offen                \n"
-"buffer_load_dwordx4  acc[100:103], %[v_os_b6], s[20:23], 0 offen offset:1024  \n"
-"buffer_load_dwordx4  acc[104:107], %[v_os_b6], s[20:23], 0 offen offset:2048  \n"
-"buffer_load_dwordx4  acc[108:111], %[v_os_b6], s[20:23], 0 offen offset:3072  \n"
-"buffer_load_dwordx4  acc[112:115], %[v_os_b7], s[20:23], 0 offen              \n"
-"buffer_load_dwordx4  acc[116:119], %[v_os_b7], s[20:23], 0 offen offset:1024  \n"
-"buffer_load_dwordx4  acc[120:123], %[v_os_b7], s[20:23], 0 offen offset:2048  \n"
-"buffer_load_dwordx4  acc[124:127], %[v_os_b7], s[20:23], 0 offen offset:3072  \n"
-"s_cmp_gt_i32  %[s_loop_cnt] 1             ; move b with cond \n"
-"s_cselect_b32 s86, %[s_tile_os_b], 0      ; move b with cond \n"
-"s_add_u32     s20, s86, s20               ; move b with cond \n"
-"s_addc_u32    s21, 0, s21                 ; move b with cond \n"
-"s_waitcnt     vmcnt(40)                        \n"
-"s_barrier                                      \n"
-"ds_read_b128  v[64:67], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_0]\n"    // 1024: N stride, 64 K stride
-"ds_read_b128  v[68:71], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_1]\n"
-"ds_read_b128  v[72:75], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_2]\n"
-"ds_read_b128  v[76:79], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_3]\n"
-"ds_read_b128  v[80:83], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_4]\n"
-"ds_read_b128  v[84:87], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_5]\n"
-"ds_read_b128  v[88:91], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_6]\n"
-"ds_read_b128  v[92:95], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_7]\n"
-"L_start%=:                                                         \n"
-"  s_waitcnt     vmcnt(24) & lgkmcnt(0)                             \n"
-"  s_barrier                                                        \n"
-_UK_MFMA_ "  %[v_acc_0], acc[0:1], v[64:65], %[v_acc_0] \n"
-_UK_MFMA_ "  %[v_acc_0], acc[2:3], v[66:67], %[v_acc_0] \n"
-"  buffer_load_dwordx4  acc[128:131], %[v_os_b0], s[20:23], 0 offen \n"
-_UK_MFMA_ "  %[v_acc_0], acc[4:5], v[68:69], %[v_acc_0] \n"
-_UK_MFMA_ "  %[v_acc_0], acc[6:7], v[70:71], %[v_acc_0] \n"
-"  buffer_load_dword   %[v_os_a0], s[16:19], 0 offen lds     \n"
-"  s_add_u32     m0, %[s_size_per_issue], m0                  \n"
-_UK_MFMA_ "  %[v_acc_0], acc[8:9], v[72:73], %[v_acc_0] \n"
-_UK_MFMA_ "  %[v_acc_0], acc[10:11], v[74:75], %[v_acc_0] \n"
-"  buffer_load_dwordx4  acc[132:135], %[v_os_b0], s[20:23], 0 offen offset:1024 \n"
-_UK_MFMA_ "  %[v_acc_0], acc[12:13], v[76:77], %[v_acc_0] \n"
-_UK_MFMA_ "  %[v_acc_0], acc[14:15], v[78:79], %[v_acc_0] \n"
-"  buffer_load_dword   %[v_os_a1], s[16:19], 0 offen lds     \n"
-"  s_add_u32     m0, %[s_size_per_issue], m0                  \n"
-_UK_MFMA_ "  %[v_acc_1], acc[0:1], v[80:81], %[v_acc_1] \n"
-_UK_MFMA_ "  %[v_acc_1], acc[2:3], v[82:83], %[v_acc_1] \n"
-"  buffer_load_dwordx4  acc[136:139], %[v_os_b0], s[20:23], 0 offen offset:2048 \n"
-_UK_MFMA_ "  %[v_acc_1], acc[4:5], v[84:85], %[v_acc_1] \n"
-_UK_MFMA_ "  %[v_acc_1], acc[6:7], v[86:87], %[v_acc_1] \n"
-"  buffer_load_dword   %[v_os_a2], s[16:19], 0 offen lds     \n"
-"  s_add_u32     m0, %[s_size_per_issue], m0                  \n"
-_UK_MFMA_ "  %[v_acc_1], acc[8:9], v[88:89], %[v_acc_1] \n"
-_UK_MFMA_ "  %[v_acc_1], acc[10:11], v[90:91], %[v_acc_1] \n"
-"  buffer_load_dwordx4  acc[140:143], %[v_os_b0], s[20:23], 0 offen offset:3072 \n"
-_UK_MFMA_ "  %[v_acc_1], acc[12:13], v[92:93], %[v_acc_1] \n"
-_UK_MFMA_ "  %[v_acc_1], acc[14:15], v[94:95], %[v_acc_1] \n"
-"  buffer_load_dword   %[v_os_a3], s[16:19], 0 offen lds     \n"
-"  s_add_u32     m0, %[s_size_per_issue], m0                  \n"
-_UK_MFMA_ "  %[v_acc_2], acc[16:17], v[64:65], %[v_acc_2] \n"
-_UK_MFMA_ "  %[v_acc_2], acc[18:19], v[66:67], %[v_acc_2] \n"
-"  buffer_load_dwordx4  acc[144:147], %[v_os_b1], s[20:23], 0 offen \n"
-_UK_MFMA_ "  %[v_acc_2], acc[20:21], v[68:69], %[v_acc_2] \n"
-_UK_MFMA_ "  %[v_acc_2], acc[22:23], v[70:71], %[v_acc_2] \n"
-"  buffer_load_dword   %[v_os_a4], s[16:19], 0 offen lds     \n"
-"  s_add_u32     m0, %[s_size_per_issue], m0                  \n"
-_UK_MFMA_ "  %[v_acc_2], acc[24:25], v[72:73], %[v_acc_2] \n"
-_UK_MFMA_ "  %[v_acc_2], acc[26:27], v[74:75], %[v_acc_2] \n"
-"  buffer_load_dwordx4  acc[148:151], %[v_os_b1], s[20:23], 0 offen offset:1024 \n"
-_UK_MFMA_ "  %[v_acc_2], acc[28:29], v[76:77], %[v_acc_2] \n"
-_UK_MFMA_ "  %[v_acc_2], acc[30:31], v[78:79], %[v_acc_2] \n"
-"  buffer_load_dword   %[v_os_a5], s[16:19], 0 offen lds     \n"
-"  s_add_u32     m0, %[s_size_per_issue], m0                  \n"
-_UK_MFMA_ "  %[v_acc_3], acc[16:17], v[80:81], %[v_acc_3] \n"
-_UK_MFMA_ "  %[v_acc_3], acc[18:19], v[82:83], %[v_acc_3] \n"
-"  buffer_load_dwordx4  acc[152:155], %[v_os_b1], s[20:23], 0 offen offset:2048 \n"
-_UK_MFMA_ "  %[v_acc_3], acc[20:21], v[84:85], %[v_acc_3] \n"
-_UK_MFMA_ "  %[v_acc_3], acc[22:23], v[86:87], %[v_acc_3] \n"
-"  buffer_load_dword   %[v_os_a6], s[16:19], 0 offen lds     \n"
-"  s_add_u32     m0, %[s_size_per_issue], m0                  \n"
-_UK_MFMA_ "  %[v_acc_3], acc[24:25], v[88:89], %[v_acc_3] \n"
-_UK_MFMA_ "  %[v_acc_3], acc[26:27], v[90:91], %[v_acc_3] \n"
-"  buffer_load_dwordx4  acc[156:159], %[v_os_b1], s[20:23], 0 offen offset:3072 \n"
-_UK_MFMA_ "  %[v_acc_3], acc[28:29], v[92:93], %[v_acc_3] \n"
-_UK_MFMA_ "  %[v_acc_3], acc[30:31], v[94:95], %[v_acc_3] \n"
-"  buffer_load_dword   %[v_os_a7], s[16:19], 0 offen lds     \n"
-"  s_add_u32     m0, %[smem_sz], %[s_m0_init]                  \n"
-"  s_waitcnt     vmcnt(32)                               \n"
-_UK_MFMA_ "  %[v_acc_4], acc[32:33], v[64:65], %[v_acc_4] \n"
-_UK_MFMA_ "  %[v_acc_4], acc[34:35], v[66:67], %[v_acc_4] \n"
-"  buffer_load_dwordx4  acc[160:163], %[v_os_b2], s[20:23], 0 offen \n"
-_UK_MFMA_ "  %[v_acc_4], acc[36:37], v[68:69], %[v_acc_4] \n"
-_UK_MFMA_ "  %[v_acc_4], acc[38:39], v[70:71], %[v_acc_4] \n"
-"  ds_read_b128  v[96:99], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_0]                \n"
-_UK_MFMA_ "  %[v_acc_4], acc[40:41], v[72:73], %[v_acc_4] \n"
-_UK_MFMA_ "  %[v_acc_4], acc[42:43], v[74:75], %[v_acc_4] \n"
-"  buffer_load_dwordx4  acc[164:167], %[v_os_b2], s[20:23], 0 offen offset:1024 \n"
-_UK_MFMA_ "  %[v_acc_4], acc[44:45], v[76:77], %[v_acc_4] \n"
-_UK_MFMA_ "  %[v_acc_4], acc[46:47], v[78:79], %[v_acc_4] \n"
-"  ds_read_b128  v[100:103], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_1]              \n"
-_UK_MFMA_ "  %[v_acc_5], acc[32:33], v[80:81], %[v_acc_5] \n"
-_UK_MFMA_ "  %[v_acc_5], acc[34:35], v[82:83], %[v_acc_5] \n"
-"  buffer_load_dwordx4  acc[168:171], %[v_os_b2], s[20:23], 0 offen offset:2048 \n"
-_UK_MFMA_ "  %[v_acc_5], acc[36:37], v[84:85], %[v_acc_5] \n"
-_UK_MFMA_ "  %[v_acc_5], acc[38:39], v[86:87], %[v_acc_5] \n"
-"  ds_read_b128  v[104:107], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_2]              \n"
-_UK_MFMA_ "  %[v_acc_5], acc[40:41], v[88:89], %[v_acc_5] \n"
-_UK_MFMA_ "  %[v_acc_5], acc[42:43], v[90:91], %[v_acc_5] \n"
-"  buffer_load_dwordx4  acc[172:175], %[v_os_b2], s[20:23], 0 offen offset:3072 \n"
-_UK_MFMA_ "  %[v_acc_5], acc[44:45], v[92:93], %[v_acc_5] \n"
-_UK_MFMA_ "  %[v_acc_5], acc[46:47], v[94:95], %[v_acc_5] \n"
-"  ds_read_b128  v[108:111], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_3]              \n"
-_UK_MFMA_ "  %[v_acc_6], acc[48:49], v[64:65], %[v_acc_6] \n"
-_UK_MFMA_ "  %[v_acc_6], acc[50:51], v[66:67], %[v_acc_6] \n"
-"  buffer_load_dwordx4  acc[176:179], %[v_os_b3], s[20:23], 0 offen \n"
-_UK_MFMA_ "  %[v_acc_6], acc[52:53], v[68:69], %[v_acc_6] \n"
-_UK_MFMA_ "  %[v_acc_6], acc[54:55], v[70:71], %[v_acc_6] \n"
-"  ds_read_b128  v[112:115], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_4]              \n"
-_UK_MFMA_ "  %[v_acc_6], acc[56:57], v[72:73], %[v_acc_6] \n"
-_UK_MFMA_ "  %[v_acc_6], acc[58:59], v[74:75], %[v_acc_6] \n"
-"  buffer_load_dwordx4  acc[180:183], %[v_os_b3], s[20:23], 0 offen offset:1024 \n"
-_UK_MFMA_ "  %[v_acc_6], acc[60:61], v[76:77], %[v_acc_6] \n"
-_UK_MFMA_ "  %[v_acc_6], acc[62:63], v[78:79], %[v_acc_6] \n"
-"  ds_read_b128  v[116:119], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_5]              \n"
-_UK_MFMA_ "  %[v_acc_7], acc[48:49], v[80:81], %[v_acc_7] \n"
-_UK_MFMA_ "  %[v_acc_7], acc[50:51], v[82:83], %[v_acc_7] \n"
-"  buffer_load_dwordx4  acc[184:187], %[v_os_b3], s[20:23], 0 offen offset:2048 \n"
-_UK_MFMA_ "  %[v_acc_7], acc[52:53], v[84:85], %[v_acc_7] \n"
-_UK_MFMA_ "  %[v_acc_7], acc[54:55], v[86:87], %[v_acc_7] \n"
-"  ds_read_b128  v[120:123], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_6]              \n"
-_UK_MFMA_ "  %[v_acc_7], acc[56:57], v[88:89], %[v_acc_7] \n"
-_UK_MFMA_ "  %[v_acc_7], acc[58:59], v[90:91], %[v_acc_7] \n"
-"  buffer_load_dwordx4  acc[188:191], %[v_os_b3], s[20:23], 0 offen offset:3072 \n"
-_UK_MFMA_ "  %[v_acc_7], acc[60:61], v[92:93], %[v_acc_7] \n"
-_UK_MFMA_ "  %[v_acc_7], acc[62:63], v[94:95], %[v_acc_7] \n"
-"  ds_read_b128  v[124:127], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_7]              \n"
-"  s_waitcnt     vmcnt(32)                               \n"
-_UK_MFMA_ "  %[v_acc_8], acc[64:65], v[64:65], %[v_acc_8] \n"
-_UK_MFMA_ "  %[v_acc_8], acc[66:67], v[66:67], %[v_acc_8] \n"
-"  buffer_load_dwordx4  acc[192:195], %[v_os_b4], s[20:23], 0 offen \n"
-_UK_MFMA_ "  %[v_acc_8], acc[68:69], v[68:69], %[v_acc_8] \n"
-_UK_MFMA_ "  %[v_acc_8], acc[70:71], v[70:71], %[v_acc_8] \n"
-_UK_MFMA_ "  %[v_acc_8], acc[72:73], v[72:73], %[v_acc_8] \n"
-_UK_MFMA_ "  %[v_acc_8], acc[74:75], v[74:75], %[v_acc_8] \n"
-"  buffer_load_dwordx4  acc[196:199], %[v_os_b4], s[20:23], 0 offen offset:1024 \n"
-_UK_MFMA_ "  %[v_acc_8], acc[76:77], v[76:77], %[v_acc_8] \n"
-_UK_MFMA_ "  %[v_acc_8], acc[78:79], v[78:79], %[v_acc_8] \n"
-_UK_MFMA_ "  %[v_acc_9], acc[64:65], v[80:81], %[v_acc_9] \n"
-_UK_MFMA_ "  %[v_acc_9], acc[66:67], v[82:83], %[v_acc_9] \n"
-"  buffer_load_dwordx4  acc[200:203], %[v_os_b4], s[20:23], 0 offen offset:2048 \n"
-_UK_MFMA_ "  %[v_acc_9], acc[68:69], v[84:85], %[v_acc_9] \n"
-_UK_MFMA_ "  %[v_acc_9], acc[70:71], v[86:87], %[v_acc_9] \n"
-_UK_MFMA_ "  %[v_acc_9], acc[72:73], v[88:89], %[v_acc_9] \n"
-_UK_MFMA_ "  %[v_acc_9], acc[74:75], v[90:91], %[v_acc_9] \n"
-"  buffer_load_dwordx4  acc[204:207], %[v_os_b4], s[20:23], 0 offen offset:3072 \n"
-_UK_MFMA_ "  %[v_acc_9], acc[76:77], v[92:93], %[v_acc_9] \n"
-_UK_MFMA_ "  %[v_acc_9], acc[78:79], v[94:95], %[v_acc_9] \n"
-_UK_MFMA_ "  %[v_acc_10], acc[80:81], v[64:65], %[v_acc_10] \n"
-_UK_MFMA_ "  %[v_acc_10], acc[82:83], v[66:67], %[v_acc_10] \n"
-"  buffer_load_dwordx4  acc[208:211], %[v_os_b5], s[20:23], 0 offen \n"
-_UK_MFMA_ "  %[v_acc_10], acc[84:85], v[68:69], %[v_acc_10] \n"
-_UK_MFMA_ "  %[v_acc_10], acc[86:87], v[70:71], %[v_acc_10] \n"
-_UK_MFMA_ "  %[v_acc_10], acc[88:89], v[72:73], %[v_acc_10] \n"
-_UK_MFMA_ "  %[v_acc_10], acc[90:91], v[74:75], %[v_acc_10] \n"
-"  buffer_load_dwordx4  acc[212:215], %[v_os_b5], s[20:23], 0 offen offset:1024 \n"
-_UK_MFMA_ "  %[v_acc_10], acc[92:93], v[76:77], %[v_acc_10] \n"
-_UK_MFMA_ "  %[v_acc_10], acc[94:95], v[78:79], %[v_acc_10] \n"
-_UK_MFMA_ "  %[v_acc_11], acc[80:81], v[80:81], %[v_acc_11] \n"
-_UK_MFMA_ "  %[v_acc_11], acc[82:83], v[82:83], %[v_acc_11] \n"
-"  buffer_load_dwordx4  acc[216:219], %[v_os_b5], s[20:23], 0 offen offset:2048 \n"
-_UK_MFMA_ "  %[v_acc_11], acc[84:85], v[84:85], %[v_acc_11] \n"
-_UK_MFMA_ "  %[v_acc_11], acc[86:87], v[86:87], %[v_acc_11] \n"
-_UK_MFMA_ "  %[v_acc_11], acc[88:89], v[88:89], %[v_acc_11] \n"
-_UK_MFMA_ "  %[v_acc_11], acc[90:91], v[90:91], %[v_acc_11] \n"
-"  buffer_load_dwordx4  acc[220:223], %[v_os_b5], s[20:23], 0 offen offset:3072 \n"
-_UK_MFMA_ "  %[v_acc_11], acc[92:93], v[92:93], %[v_acc_11] \n"
-_UK_MFMA_ "  %[v_acc_11], acc[94:95], v[94:95], %[v_acc_11] \n"
-"  s_waitcnt     vmcnt(32)                               \n"
-_UK_MFMA_ "  %[v_acc_12], acc[96:97], v[64:65], %[v_acc_12] \n"
-_UK_MFMA_ "  %[v_acc_12], acc[98:99], v[66:67], %[v_acc_12] \n"
-"  buffer_load_dwordx4  acc[224:227], %[v_os_b6], s[20:23], 0 offen \n"
-_UK_MFMA_ "  %[v_acc_12], acc[100:101], v[68:69], %[v_acc_12] \n"
-_UK_MFMA_ "  %[v_acc_12], acc[102:103], v[70:71], %[v_acc_12] \n"
-_UK_MFMA_ "  %[v_acc_12], acc[104:105], v[72:73], %[v_acc_12] \n"
-_UK_MFMA_ "  %[v_acc_12], acc[106:107], v[74:75], %[v_acc_12] \n"
-"  buffer_load_dwordx4  acc[228:231], %[v_os_b6], s[20:23], 0 offen offset:1024 \n"
-_UK_MFMA_ "  %[v_acc_12], acc[108:109], v[76:77], %[v_acc_12] \n"
-_UK_MFMA_ "  %[v_acc_12], acc[110:111], v[78:79], %[v_acc_12] \n"
-_UK_MFMA_ "  %[v_acc_13], acc[96:97], v[80:81], %[v_acc_13] \n"
-_UK_MFMA_ "  %[v_acc_13], acc[98:99], v[82:83], %[v_acc_13] \n"
-"  buffer_load_dwordx4  acc[232:235], %[v_os_b6], s[20:23], 0 offen offset:2048 \n"
-_UK_MFMA_ "  %[v_acc_13], acc[100:101], v[84:85], %[v_acc_13] \n"
-_UK_MFMA_ "  %[v_acc_13], acc[102:103], v[86:87], %[v_acc_13] \n"
-_UK_MFMA_ "  %[v_acc_13], acc[104:105], v[88:89], %[v_acc_13] \n"
-_UK_MFMA_ "  %[v_acc_13], acc[106:107], v[90:91], %[v_acc_13] \n"
-"  buffer_load_dwordx4  acc[236:239], %[v_os_b6], s[20:23], 0 offen offset:3072 \n"
-_UK_MFMA_ "  %[v_acc_13], acc[108:109], v[92:93], %[v_acc_13] \n"
-_UK_MFMA_ "  %[v_acc_13], acc[110:111], v[94:95], %[v_acc_13] \n"
-_UK_MFMA_ "  %[v_acc_14], acc[112:113], v[64:65], %[v_acc_14] \n"
-_UK_MFMA_ "  %[v_acc_14], acc[114:115], v[66:67], %[v_acc_14] \n"
-"  buffer_load_dwordx4  acc[240:243], %[v_os_b7], s[20:23], 0 offen \n"
-_UK_MFMA_ "  %[v_acc_14], acc[116:117], v[68:69], %[v_acc_14] \n"
-_UK_MFMA_ "  %[v_acc_14], acc[118:119], v[70:71], %[v_acc_14] \n"
-_UK_MFMA_ "  %[v_acc_14], acc[120:121], v[72:73], %[v_acc_14] \n"
-_UK_MFMA_ "  %[v_acc_14], acc[122:123], v[74:75], %[v_acc_14] \n"
-"  buffer_load_dwordx4  acc[244:247], %[v_os_b7], s[20:23], 0 offen offset:1024 \n"
-_UK_MFMA_ "  %[v_acc_14], acc[124:125], v[76:77], %[v_acc_14] \n"
-_UK_MFMA_ "  %[v_acc_14], acc[126:127], v[78:79], %[v_acc_14] \n"
-_UK_MFMA_ "  %[v_acc_15], acc[112:113], v[80:81], %[v_acc_15] \n"
-_UK_MFMA_ "  %[v_acc_15], acc[114:115], v[82:83], %[v_acc_15] \n"
-"  buffer_load_dwordx4  acc[248:251], %[v_os_b7], s[20:23], 0 offen offset:2048 \n"
-_UK_MFMA_ "  %[v_acc_15], acc[116:117], v[84:85], %[v_acc_15] \n"
-_UK_MFMA_ "  %[v_acc_15], acc[118:119], v[86:87], %[v_acc_15] \n"
-_UK_MFMA_ "  %[v_acc_15], acc[120:121], v[88:89], %[v_acc_15] \n"
-_UK_MFMA_ "  %[v_acc_15], acc[122:123], v[90:91], %[v_acc_15] \n"
-"  buffer_load_dwordx4  acc[252:255], %[v_os_b7], s[20:23], 0 offen offset:3072\n"
-_UK_MFMA_ "  %[v_acc_15], acc[124:125], v[92:93], %[v_acc_15] \n"
-_UK_MFMA_ "  %[v_acc_15], acc[126:127], v[94:95], %[v_acc_15] \n"
-"  s_sub_i32     %[s_loop_cnt], %[s_loop_cnt], 1                \n"
-"  s_cmp_gt_i32  %[s_loop_cnt] 0                                \n"
-"  s_cbranch_scc0 L_end%=                                       \n"
-"  s_cmp_gt_i32  %[s_loop_cnt] 2             ; move a with cond \n"
-"  s_cselect_b32 s86, %[s_tile_os_a], 0                          \n"
-"  s_add_u32     s16, s86, s16                                  \n"
-"  s_addc_u32    s17, 0, s17                                    \n"
-"  s_cmp_gt_i32  %[s_loop_cnt] 1             ; move b with cond \n"
-"  s_cselect_b32 s86, %[s_tile_os_b], 0                          \n"
-"  s_add_u32     s20, s86, s20                                  \n"
-"  s_addc_u32    s21, 0, s21                                    \n"
-"  ;------------------------------------------                  \n"
-"  s_waitcnt     vmcnt(24) & lgkmcnt(0)                  \n"
-"  s_barrier                                             \n"
-_UK_MFMA_ "  %[v_acc_0], acc[128:129], v[96:97], %[v_acc_0] \n"
-_UK_MFMA_ "  %[v_acc_0], acc[130:131], v[98:99], %[v_acc_0] \n"
-"  buffer_load_dwordx4  acc[0:3], %[v_os_b0], s[20:23], 0 offen \n"
-_UK_MFMA_ "  %[v_acc_0], acc[132:133], v[100:101], %[v_acc_0] \n"
-_UK_MFMA_ "  %[v_acc_0], acc[134:135], v[102:103], %[v_acc_0] \n"
-"  buffer_load_dword   %[v_os_a0], s[16:19], 0 offen lds     \n"
-"  s_add_u32     m0, %[s_size_per_issue], m0                  \n"
-_UK_MFMA_ "  %[v_acc_0], acc[136:137], v[104:105], %[v_acc_0] \n"
-_UK_MFMA_ "  %[v_acc_0], acc[138:139], v[106:107], %[v_acc_0] \n"
-"  buffer_load_dwordx4  acc[4:7], %[v_os_b0], s[20:23], 0 offen offset:1024 \n"
-_UK_MFMA_ "  %[v_acc_0], acc[140:141], v[108:109], %[v_acc_0] \n"
-_UK_MFMA_ "  %[v_acc_0], acc[142:143], v[110:111], %[v_acc_0] \n"
-"  buffer_load_dword   %[v_os_a1], s[16:19], 0 offen lds     \n"
-"  s_add_u32     m0, %[s_size_per_issue], m0                  \n"
-_UK_MFMA_ "  %[v_acc_1], acc[128:129], v[112:113], %[v_acc_1] \n"
-_UK_MFMA_ "  %[v_acc_1], acc[130:131], v[114:115], %[v_acc_1] \n"
-"  buffer_load_dwordx4  acc[8:11], %[v_os_b0], s[20:23], 0 offen offset:2048 \n"
-_UK_MFMA_ "  %[v_acc_1], acc[132:133], v[116:117], %[v_acc_1] \n"
-_UK_MFMA_ "  %[v_acc_1], acc[134:135], v[118:119], %[v_acc_1] \n"
-"  buffer_load_dword   %[v_os_a2], s[16:19], 0 offen lds     \n"
-"  s_add_u32     m0, %[s_size_per_issue], m0                  \n"
-_UK_MFMA_ "  %[v_acc_1], acc[136:137], v[120:121], %[v_acc_1] \n"
-_UK_MFMA_ "  %[v_acc_1], acc[138:139], v[122:123], %[v_acc_1] \n"
-"  buffer_load_dwordx4  acc[12:15], %[v_os_b0], s[20:23], 0 offen offset:3072 \n"
-_UK_MFMA_ "  %[v_acc_1], acc[140:141], v[124:125], %[v_acc_1] \n"
-_UK_MFMA_ "  %[v_acc_1], acc[142:143], v[126:127], %[v_acc_1] \n"
-"  buffer_load_dword   %[v_os_a3], s[16:19], 0 offen lds     \n"
-"  s_add_u32     m0, %[s_size_per_issue], m0                  \n"
-_UK_MFMA_ "  %[v_acc_2], acc[144:145], v[96:97], %[v_acc_2] \n"
-_UK_MFMA_ "  %[v_acc_2], acc[146:147], v[98:99], %[v_acc_2] \n"
-"  buffer_load_dwordx4  acc[16:19], %[v_os_b1], s[20:23], 0 offen \n"
-_UK_MFMA_ "  %[v_acc_2], acc[148:149], v[100:101], %[v_acc_2] \n"
-_UK_MFMA_ "  %[v_acc_2], acc[150:151], v[102:103], %[v_acc_2] \n"
-"  buffer_load_dword   %[v_os_a4], s[16:19], 0 offen lds     \n"
-"  s_add_u32     m0, %[s_size_per_issue], m0                  \n"
-_UK_MFMA_ "  %[v_acc_2], acc[152:153], v[104:105], %[v_acc_2] \n"
-_UK_MFMA_ "  %[v_acc_2], acc[154:155], v[106:107], %[v_acc_2] \n"
-"  buffer_load_dwordx4  acc[20:23], %[v_os_b1], s[20:23], 0 offen offset:1024 \n"
-_UK_MFMA_ "  %[v_acc_2], acc[156:157], v[108:109], %[v_acc_2] \n"
-_UK_MFMA_ "  %[v_acc_2], acc[158:159], v[110:111], %[v_acc_2] \n"
-"  buffer_load_dword   %[v_os_a5], s[16:19], 0 offen lds     \n"
-"  s_add_u32     m0, %[s_size_per_issue], m0                  \n"
-_UK_MFMA_ "  %[v_acc_3], acc[144:145], v[112:113], %[v_acc_3] \n"
-_UK_MFMA_ "  %[v_acc_3], acc[146:147], v[114:115], %[v_acc_3] \n"
-"  buffer_load_dwordx4  acc[24:27], %[v_os_b1], s[20:23], 0 offen offset:2048 \n"
-_UK_MFMA_ "  %[v_acc_3], acc[148:149], v[116:117], %[v_acc_3] \n"
-_UK_MFMA_ "  %[v_acc_3], acc[150:151], v[118:119], %[v_acc_3] \n"
-"  buffer_load_dword   %[v_os_a6], s[16:19], 0 offen lds     \n"
-"  s_add_u32     m0, %[s_size_per_issue], m0                  \n"
-_UK_MFMA_ "  %[v_acc_3], acc[152:153], v[120:121], %[v_acc_3] \n"
-_UK_MFMA_ "  %[v_acc_3], acc[154:155], v[122:123], %[v_acc_3] \n"
-"  buffer_load_dwordx4  acc[28:31], %[v_os_b1], s[20:23], 0 offen offset:3072 \n"
-_UK_MFMA_ "  %[v_acc_3], acc[156:157], v[124:125], %[v_acc_3] \n"
-_UK_MFMA_ "  %[v_acc_3], acc[158:159], v[126:127], %[v_acc_3] \n"
-"  buffer_load_dword   %[v_os_a7], s[16:19], 0 offen lds     \n"
-"  s_add_u32     m0, 0, %[s_m0_init]                  \n"
-"  s_waitcnt     vmcnt(32)                               \n"
-_UK_MFMA_ "  %[v_acc_4], acc[160:161], v[96:97], %[v_acc_4] \n"
-_UK_MFMA_ "  %[v_acc_4], acc[162:163], v[98:99], %[v_acc_4] \n"
-"  buffer_load_dwordx4  acc[32:35], %[v_os_b2], s[20:23], 0 offen \n"
-_UK_MFMA_ "  %[v_acc_4], acc[164:165], v[100:101], %[v_acc_4] \n"
-_UK_MFMA_ "  %[v_acc_4], acc[166:167], v[102:103], %[v_acc_4] \n"
-"  ds_read_b128  v[64:67], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_0]  \n"
-_UK_MFMA_ "  %[v_acc_4], acc[168:169], v[104:105], %[v_acc_4] \n"
-_UK_MFMA_ "  %[v_acc_4], acc[170:171], v[106:107], %[v_acc_4] \n"
-"  buffer_load_dwordx4  acc[36:39], %[v_os_b2], s[20:23], 0 offen offset:1024 \n"
-_UK_MFMA_ "  %[v_acc_4], acc[172:173], v[108:109], %[v_acc_4] \n"
-_UK_MFMA_ "  %[v_acc_4], acc[174:175], v[110:111], %[v_acc_4] \n"
-"  ds_read_b128  v[68:71], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_1]  \n"
-_UK_MFMA_ "  %[v_acc_5], acc[160:161], v[112:113], %[v_acc_5] \n"
-_UK_MFMA_ "  %[v_acc_5], acc[162:163], v[114:115], %[v_acc_5] \n"
-"  buffer_load_dwordx4  acc[40:43], %[v_os_b2], s[20:23], 0 offen offset:2048 \n"
-_UK_MFMA_ "  %[v_acc_5], acc[164:165], v[116:117], %[v_acc_5] \n"
-_UK_MFMA_ "  %[v_acc_5], acc[166:167], v[118:119], %[v_acc_5] \n"
-"  ds_read_b128  v[72:75], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_2]                 \n"
-_UK_MFMA_ "  %[v_acc_5], acc[168:169], v[120:121], %[v_acc_5] \n"
-_UK_MFMA_ "  %[v_acc_5], acc[170:171], v[122:123], %[v_acc_5] \n"
-"  buffer_load_dwordx4  acc[44:47], %[v_os_b2], s[20:23], 0 offen offset:3072 \n"
-_UK_MFMA_ "  %[v_acc_5], acc[172:173], v[124:125], %[v_acc_5] \n"
-_UK_MFMA_ "  %[v_acc_5], acc[174:175], v[126:127], %[v_acc_5] \n"
-"  ds_read_b128  v[76:79], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_3]                \n"
-_UK_MFMA_ "  %[v_acc_6], acc[176:177], v[96:97], %[v_acc_6] \n"
-_UK_MFMA_ "  %[v_acc_6], acc[178:179], v[98:99], %[v_acc_6] \n"
-"  buffer_load_dwordx4  acc[48:51], %[v_os_b3], s[20:23], 0 offen \n"
-_UK_MFMA_ "  %[v_acc_6], acc[180:181], v[100:101], %[v_acc_6] \n"
-_UK_MFMA_ "  %[v_acc_6], acc[182:183], v[102:103], %[v_acc_6] \n"
-"  ds_read_b128  v[80:83], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_4]               \n"
-_UK_MFMA_ "  %[v_acc_6], acc[184:185], v[104:105], %[v_acc_6] \n"
-_UK_MFMA_ "  %[v_acc_6], acc[186:187], v[106:107], %[v_acc_6] \n"
-"  buffer_load_dwordx4  acc[52:55], %[v_os_b3], s[20:23], 0 offen offset:1024 \n"
-_UK_MFMA_ "  %[v_acc_6], acc[188:189], v[108:109], %[v_acc_6] \n"
-_UK_MFMA_ "  %[v_acc_6], acc[190:191], v[110:111], %[v_acc_6] \n"
-"  ds_read_b128  v[84:87], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_5]            \n"
-_UK_MFMA_ "  %[v_acc_7], acc[176:177], v[112:113], %[v_acc_7] \n"
-_UK_MFMA_ "  %[v_acc_7], acc[178:179], v[114:115], %[v_acc_7] \n"
-"  buffer_load_dwordx4  acc[56:59], %[v_os_b3], s[20:23], 0 offen offset:2048 \n"
-_UK_MFMA_ "  %[v_acc_7], acc[180:181], v[116:117], %[v_acc_7] \n"
-_UK_MFMA_ "  %[v_acc_7], acc[182:183], v[118:119], %[v_acc_7] \n"
-"  ds_read_b128  v[88:91], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_6]                \n"
-_UK_MFMA_ "  %[v_acc_7], acc[184:185], v[120:121], %[v_acc_7] \n"
-_UK_MFMA_ "  %[v_acc_7], acc[186:187], v[122:123], %[v_acc_7] \n"
-"  buffer_load_dwordx4  acc[60:63], %[v_os_b3], s[20:23], 0 offen offset:3072 \n"
-_UK_MFMA_ "  %[v_acc_7], acc[188:189], v[124:125], %[v_acc_7] \n"
-_UK_MFMA_ "  %[v_acc_7], acc[190:191], v[126:127], %[v_acc_7] \n"
-"  ds_read_b128  v[92:95], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_7]           \n"
-"  s_waitcnt     vmcnt(32)                               \n"
-_UK_MFMA_ "  %[v_acc_8], acc[192:193], v[96:97], %[v_acc_8] \n"
-_UK_MFMA_ "  %[v_acc_8], acc[194:195], v[98:99], %[v_acc_8] \n"
-"  buffer_load_dwordx4  acc[64:67], %[v_os_b4], s[20:23], 0 offen \n"
-_UK_MFMA_ "  %[v_acc_8], acc[196:197], v[100:101], %[v_acc_8] \n"
-_UK_MFMA_ "  %[v_acc_8], acc[198:199], v[102:103], %[v_acc_8] \n"
-_UK_MFMA_ "  %[v_acc_8], acc[200:201], v[104:105], %[v_acc_8] \n"
-_UK_MFMA_ "  %[v_acc_8], acc[202:203], v[106:107], %[v_acc_8] \n"
-"  buffer_load_dwordx4  acc[68:71], %[v_os_b4], s[20:23], 0 offen offset:1024 \n"
-_UK_MFMA_ "  %[v_acc_8], acc[204:205], v[108:109], %[v_acc_8] \n"
-_UK_MFMA_ "  %[v_acc_8], acc[206:207], v[110:111], %[v_acc_8] \n"
-_UK_MFMA_ "  %[v_acc_9], acc[192:193], v[112:113], %[v_acc_9] \n"
-_UK_MFMA_ "  %[v_acc_9], acc[194:195], v[114:115], %[v_acc_9] \n"
-"  buffer_load_dwordx4  acc[72:75], %[v_os_b4], s[20:23], 0 offen offset:2048 \n"
-_UK_MFMA_ "  %[v_acc_9], acc[196:197], v[116:117], %[v_acc_9] \n"
-_UK_MFMA_ "  %[v_acc_9], acc[198:199], v[118:119], %[v_acc_9] \n"
-_UK_MFMA_ "  %[v_acc_9], acc[200:201], v[120:121], %[v_acc_9] \n"
-_UK_MFMA_ "  %[v_acc_9], acc[202:203], v[122:123], %[v_acc_9] \n"
-"  buffer_load_dwordx4  acc[76:79], %[v_os_b4], s[20:23], 0 offen offset:3072 \n"
-_UK_MFMA_ "  %[v_acc_9], acc[204:205], v[124:125], %[v_acc_9] \n"
-_UK_MFMA_ "  %[v_acc_9], acc[206:207], v[126:127], %[v_acc_9] \n"
-_UK_MFMA_ "  %[v_acc_10], acc[208:209], v[96:97], %[v_acc_10] \n"
-_UK_MFMA_ "  %[v_acc_10], acc[210:211], v[98:99], %[v_acc_10] \n"
-"  buffer_load_dwordx4  acc[80:83], %[v_os_b5], s[20:23], 0 offen \n"
-_UK_MFMA_ "  %[v_acc_10], acc[212:213], v[100:101], %[v_acc_10] \n"
-_UK_MFMA_ "  %[v_acc_10], acc[214:215], v[102:103], %[v_acc_10] \n"
-_UK_MFMA_ "  %[v_acc_10], acc[216:217], v[104:105], %[v_acc_10] \n"
-_UK_MFMA_ "  %[v_acc_10], acc[218:219], v[106:107], %[v_acc_10] \n"
-"  buffer_load_dwordx4  acc[84:87], %[v_os_b5], s[20:23], 0 offen offset:1024 \n"
-_UK_MFMA_ "  %[v_acc_10], acc[220:221], v[108:109], %[v_acc_10] \n"
-_UK_MFMA_ "  %[v_acc_10], acc[222:223], v[110:111], %[v_acc_10] \n"
-_UK_MFMA_ "  %[v_acc_11], acc[208:209], v[112:113], %[v_acc_11] \n"
-_UK_MFMA_ "  %[v_acc_11], acc[210:211], v[114:115], %[v_acc_11] \n"
-"  buffer_load_dwordx4  acc[88:91], %[v_os_b5], s[20:23], 0 offen offset:2048 \n"
-_UK_MFMA_ "  %[v_acc_11], acc[212:213], v[116:117], %[v_acc_11] \n"
-_UK_MFMA_ "  %[v_acc_11], acc[214:215], v[118:119], %[v_acc_11] \n"
-_UK_MFMA_ "  %[v_acc_11], acc[216:217], v[120:121], %[v_acc_11] \n"
-_UK_MFMA_ "  %[v_acc_11], acc[218:219], v[122:123], %[v_acc_11] \n"
-"  buffer_load_dwordx4  acc[92:95], %[v_os_b5], s[20:23], 0 offen offset:3072 \n"
-_UK_MFMA_ "  %[v_acc_11], acc[220:221], v[124:125], %[v_acc_11] \n"
-_UK_MFMA_ "  %[v_acc_11], acc[222:223], v[126:127], %[v_acc_11] \n"
-"  s_waitcnt     vmcnt(32)                               \n"
-_UK_MFMA_ "  %[v_acc_12], acc[224:225], v[96:97], %[v_acc_12] \n"
-_UK_MFMA_ "  %[v_acc_12], acc[226:227], v[98:99], %[v_acc_12] \n"
-"  buffer_load_dwordx4  acc[96:99], %[v_os_b6], s[20:23], 0 offen \n"
-_UK_MFMA_ "  %[v_acc_12], acc[228:229], v[100:101], %[v_acc_12] \n"
-_UK_MFMA_ "  %[v_acc_12], acc[230:231], v[102:103], %[v_acc_12] \n"
-_UK_MFMA_ "  %[v_acc_12], acc[232:233], v[104:105], %[v_acc_12] \n"
-_UK_MFMA_ "  %[v_acc_12], acc[234:235], v[106:107], %[v_acc_12] \n"
-"  buffer_load_dwordx4  acc[100:103], %[v_os_b6], s[20:23], 0 offen offset:1024 \n"
-_UK_MFMA_ "  %[v_acc_12], acc[236:237], v[108:109], %[v_acc_12] \n"
-_UK_MFMA_ "  %[v_acc_12], acc[238:239], v[110:111], %[v_acc_12] \n"
-_UK_MFMA_ "  %[v_acc_13], acc[224:225], v[112:113], %[v_acc_13] \n"
-_UK_MFMA_ "  %[v_acc_13], acc[226:227], v[114:115], %[v_acc_13] \n"
-"  buffer_load_dwordx4  acc[104:107], %[v_os_b6], s[20:23], 0 offen offset:2048 \n"
-_UK_MFMA_ "  %[v_acc_13], acc[228:229], v[116:117], %[v_acc_13] \n"
-_UK_MFMA_ "  %[v_acc_13], acc[230:231], v[118:119], %[v_acc_13] \n"
-_UK_MFMA_ "  %[v_acc_13], acc[232:233], v[120:121], %[v_acc_13] \n"
-_UK_MFMA_ "  %[v_acc_13], acc[234:235], v[122:123], %[v_acc_13] \n"
-"  buffer_load_dwordx4  acc[108:111], %[v_os_b6], s[20:23], 0 offen offset:3072 \n"
-_UK_MFMA_ "  %[v_acc_13], acc[236:237], v[124:125], %[v_acc_13] \n"
-_UK_MFMA_ "  %[v_acc_13], acc[238:239], v[126:127], %[v_acc_13] \n"
-_UK_MFMA_ "  %[v_acc_14], acc[240:241], v[96:97], %[v_acc_14] \n"
-_UK_MFMA_ "  %[v_acc_14], acc[242:243], v[98:99], %[v_acc_14] \n"
-"  buffer_load_dwordx4  acc[112:115], %[v_os_b7], s[20:23], 0 offen \n"
-_UK_MFMA_ "  %[v_acc_14], acc[244:245], v[100:101], %[v_acc_14] \n"
-_UK_MFMA_ "  %[v_acc_14], acc[246:247], v[102:103], %[v_acc_14] \n"
-_UK_MFMA_ "  %[v_acc_14], acc[248:249], v[104:105], %[v_acc_14] \n"
-_UK_MFMA_ "  %[v_acc_14], acc[250:251], v[106:107], %[v_acc_14] \n"
-"  buffer_load_dwordx4  acc[116:119], %[v_os_b7], s[20:23], 0 offen offset:1024 \n"
-_UK_MFMA_ "  %[v_acc_14], acc[252:253], v[108:109], %[v_acc_14] \n"
-_UK_MFMA_ "  %[v_acc_14], acc[254:255], v[110:111], %[v_acc_14] \n"
-_UK_MFMA_ "  %[v_acc_15], acc[240:241], v[112:113], %[v_acc_15] \n"
-_UK_MFMA_ "  %[v_acc_15], acc[242:243], v[114:115], %[v_acc_15] \n"
-"  buffer_load_dwordx4  acc[120:123], %[v_os_b7], s[20:23], 0 offen offset:2048 \n"
-_UK_MFMA_ "  %[v_acc_15], acc[244:245], v[116:117], %[v_acc_15] \n"
-_UK_MFMA_ "  %[v_acc_15], acc[246:247], v[118:119], %[v_acc_15] \n"
-_UK_MFMA_ "  %[v_acc_15], acc[248:249], v[120:121], %[v_acc_15] \n"
-_UK_MFMA_ "  %[v_acc_15], acc[250:251], v[122:123], %[v_acc_15] \n"
-"  buffer_load_dwordx4  acc[124:127], %[v_os_b7], s[20:23], 0 offen offset:3072 \n"
-_UK_MFMA_ "  %[v_acc_15], acc[252:253], v[124:125], %[v_acc_15] \n"
-_UK_MFMA_ "  %[v_acc_15], acc[254:255], v[126:127], %[v_acc_15] \n"
-"  s_sub_i32     %[s_loop_cnt], %[s_loop_cnt], 1                \n"
-"  s_cmp_gt_i32  %[s_loop_cnt] 0                                \n"
-"  s_cbranch_scc0 L_end%=                                       \n"
-"  s_cmp_gt_i32  %[s_loop_cnt] 2             ; move a with cond \n"
-"  s_cselect_b32 s86, %[s_tile_os_a], 0                          \n"
-"  s_add_u32     s16, s86, s16                                  \n"
-"  s_addc_u32    s17, 0, s17                                    \n"
-"  s_cmp_gt_i32  %[s_loop_cnt] 1             ; move b with cond \n"
-"  s_cselect_b32 s86, %[s_tile_os_b], 0                          \n"
-"  s_add_u32     s20, s86, s20                                  \n"
-"  s_addc_u32    s21, 0, s21                                    \n"
-"  s_branch     L_start%=                                       \n"
-"L_end%=:                                                       \n"
-"  s_nop 2                                                      \n"
+    "s_mov_b32 s17,    %[s_res_a1] \n"
+    "s_mov_b32 s18,    %[s_res_a2] \n"
+    "s_mov_b32 s19,    %[s_res_a3] \n"
+    "s_mov_b32 s20,    %[s_res_b0] \n"
+    "s_mov_b32 s21,    %[s_res_b1] \n"
+    "s_mov_b32 s22,    %[s_res_b2] \n"
+    "s_mov_b32 s23,    %[s_res_b3] \n"
+    // "s_nop  4\n"
+    "; -- prefetch A0\n"
+    "s_add_u32     m0, 0, %[s_m0_init]                        \n"
+    "buffer_load_dword   %[v_os_a0], s[16:19], 0 offen lds    \n"
+    "s_add_u32     m0, %[s_size_per_issue], m0                \n"
+    "buffer_load_dword   %[v_os_a1], s[16:19], 0 offen lds    \n"
+    "s_add_u32     m0, %[s_size_per_issue], m0                \n"
+    "buffer_load_dword   %[v_os_a2], s[16:19], 0 offen lds    \n"
+    "s_add_u32     m0, %[s_size_per_issue], m0                \n"
+    "buffer_load_dword   %[v_os_a3], s[16:19], 0 offen lds    \n"
+    "s_add_u32     m0, %[s_size_per_issue], m0                \n"
+    "buffer_load_dword   %[v_os_a4], s[16:19], 0 offen lds    \n"
+    "s_add_u32     m0, %[s_size_per_issue], m0                \n"
+    "buffer_load_dword   %[v_os_a5], s[16:19], 0 offen lds    \n"
+    "s_add_u32     m0, %[s_size_per_issue], m0                \n"
+    "buffer_load_dword   %[v_os_a6], s[16:19], 0 offen lds    \n"
+    "s_add_u32     m0, %[s_size_per_issue], m0                \n"
+    "buffer_load_dword   %[v_os_a7], s[16:19], 0 offen lds    \n"
+    "s_add_u32 m0, %[smem_sz], %[s_m0_init]                       \n"
+    "s_cmp_gt_i32  %[s_loop_cnt] 1             ; move a with cond \n"
+    "s_cselect_b32 s86, %[s_tile_os_a], 0      ; move a with cond  \n"
+    "s_add_u32     s16, s86, s16               ; move a with cond \n"
+    "s_addc_u32    s17, 0, s17                 ; move a with cond \n"
+    "; -- prefetch A1\n"
+    "buffer_load_dword   %[v_os_a0], s[16:19], 0 offen lds    \n"
+    "s_add_u32     m0, %[s_size_per_issue], m0                  \n"
+    "buffer_load_dword   %[v_os_a1], s[16:19], 0 offen lds    \n"
+    "s_add_u32     m0, %[s_size_per_issue], m0                  \n"
+    "buffer_load_dword   %[v_os_a2], s[16:19], 0 offen lds    \n"
+    "s_add_u32     m0, %[s_size_per_issue], m0                  \n"
+    "buffer_load_dword   %[v_os_a3], s[16:19], 0 offen lds    \n"
+    "s_add_u32     m0, %[s_size_per_issue], m0                  \n"
+    "buffer_load_dword   %[v_os_a4], s[16:19], 0 offen lds    \n"
+    "s_add_u32     m0, %[s_size_per_issue], m0                  \n"
+    "buffer_load_dword   %[v_os_a5], s[16:19], 0 offen lds    \n"
+    "s_add_u32     m0, %[s_size_per_issue], m0                  \n"
+    "buffer_load_dword   %[v_os_a6], s[16:19], 0 offen lds    \n"
+    "s_add_u32     m0, %[s_size_per_issue], m0                  \n"
+    "buffer_load_dword   %[v_os_a7], s[16:19], 0 offen lds    \n"
+    "s_add_u32 m0, 0, %[s_m0_init]                                \n"
+    "s_cmp_gt_i32  %[s_loop_cnt] 2             ; move a with cond \n"
+    "s_cselect_b32 s86, %[s_tile_os_a], 0      ; move a with cond  \n"
+    "s_add_u32     s16, s86, s16               ; move a with cond \n"
+    "s_addc_u32    s17, 0, s17                 ; move a with cond \n"
+    "; -- prefetch B0\n"
+    "buffer_load_dwordx4  acc[0:3], %[v_os_b0], s[20:23], 0 offen \n"
+    "buffer_load_dwordx4  acc[4:7], %[v_os_b0], s[20:23], 0 offen offset:1024  \n"
+    "buffer_load_dwordx4  acc[8:11], %[v_os_b0], s[20:23], 0 offen offset:2048  \n"
+    "buffer_load_dwordx4  acc[12:15], %[v_os_b0], s[20:23], 0 offen offset:3072  \n"
+    "buffer_load_dwordx4  acc[16:19], %[v_os_b1], s[20:23], 0 offen  \n"
+    "buffer_load_dwordx4  acc[20:23], %[v_os_b1], s[20:23], 0 offen offset:1024  \n"
+    "buffer_load_dwordx4  acc[24:27], %[v_os_b1], s[20:23], 0 offen offset:2048  \n"
+    "buffer_load_dwordx4  acc[28:31], %[v_os_b1], s[20:23], 0 offen offset:3072  \n"
+    "buffer_load_dwordx4  acc[32:35], %[v_os_b2], s[20:23], 0 offen  \n"
+    "buffer_load_dwordx4  acc[36:39], %[v_os_b2], s[20:23], 0 offen offset:1024  \n"
+    "buffer_load_dwordx4  acc[40:43], %[v_os_b2], s[20:23], 0 offen offset:2048  \n"
+    "buffer_load_dwordx4  acc[44:47], %[v_os_b2], s[20:23], 0 offen offset:3072  \n"
+    "buffer_load_dwordx4  acc[48:51], %[v_os_b3], s[20:23], 0 offen  \n"
+    "buffer_load_dwordx4  acc[52:55], %[v_os_b3], s[20:23], 0 offen offset:1024    \n"
+    "buffer_load_dwordx4  acc[56:59], %[v_os_b3], s[20:23], 0 offen offset:2048    \n"
+    "buffer_load_dwordx4  acc[60:63], %[v_os_b3], s[20:23], 0 offen offset:3072    \n"
+    "buffer_load_dwordx4  acc[64:67], %[v_os_b4], s[20:23], 0 offen                \n"
+    "buffer_load_dwordx4  acc[68:71], %[v_os_b4], s[20:23], 0 offen offset:1024    \n"
+    "buffer_load_dwordx4  acc[72:75], %[v_os_b4], s[20:23], 0 offen offset:2048    \n"
+    "buffer_load_dwordx4  acc[76:79], %[v_os_b4], s[20:23], 0 offen offset:3072    \n"
+    "buffer_load_dwordx4  acc[80:83], %[v_os_b5], s[20:23], 0 offen                \n"
+    "buffer_load_dwordx4  acc[84:87], %[v_os_b5], s[20:23], 0 offen offset:1024    \n"
+    "buffer_load_dwordx4  acc[88:91], %[v_os_b5], s[20:23], 0 offen offset:2048    \n"
+    "buffer_load_dwordx4  acc[92:95], %[v_os_b5], s[20:23], 0 offen offset:3072    \n"
+    "buffer_load_dwordx4  acc[96:99], %[v_os_b6], s[20:23], 0 offen                \n"
+    "buffer_load_dwordx4  acc[100:103], %[v_os_b6], s[20:23], 0 offen offset:1024  \n"
+    "buffer_load_dwordx4  acc[104:107], %[v_os_b6], s[20:23], 0 offen offset:2048  \n"
+    "buffer_load_dwordx4  acc[108:111], %[v_os_b6], s[20:23], 0 offen offset:3072  \n"
+    "buffer_load_dwordx4  acc[112:115], %[v_os_b7], s[20:23], 0 offen              \n"
+    "buffer_load_dwordx4  acc[116:119], %[v_os_b7], s[20:23], 0 offen offset:1024  \n"
+    "buffer_load_dwordx4  acc[120:123], %[v_os_b7], s[20:23], 0 offen offset:2048  \n"
+    "buffer_load_dwordx4  acc[124:127], %[v_os_b7], s[20:23], 0 offen offset:3072  \n"
+    "s_cmp_gt_i32  %[s_loop_cnt] 1             ; move b with cond \n"
+    "s_cselect_b32 s86, %[s_tile_os_b], 0      ; move b with cond \n"
+    "s_add_u32     s20, s86, s20               ; move b with cond \n"
+    "s_addc_u32    s21, 0, s21                 ; move b with cond \n"
+    "s_waitcnt     vmcnt(40)                        \n"
+    "s_barrier                                      \n"
+    "ds_read_b128  v[64:67], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_0]\n" // 1024: N stride, 64
+                                                                               // K stride
+    "ds_read_b128  v[68:71], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_1]\n"
+    "ds_read_b128  v[72:75], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_2]\n"
+    "ds_read_b128  v[76:79], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_3]\n"
+    "ds_read_b128  v[80:83], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_4]\n"
+    "ds_read_b128  v[84:87], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_5]\n"
+    "ds_read_b128  v[88:91], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_6]\n"
+    "ds_read_b128  v[92:95], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_7]\n"
+    "L_start%=:                                                         \n"
+    "  s_waitcnt     vmcnt(24) & lgkmcnt(0)                             \n"
+    "  s_barrier                                                        \n" _UK_MFMA_
+    "  %[v_acc_0], acc[0:1], v[64:65], %[v_acc_0] \n" _UK_MFMA_
+    "  %[v_acc_0], acc[2:3], v[66:67], %[v_acc_0] \n"
+    "  buffer_load_dwordx4  acc[128:131], %[v_os_b0], s[20:23], 0 offen \n" _UK_MFMA_
+    "  %[v_acc_0], acc[4:5], v[68:69], %[v_acc_0] \n" _UK_MFMA_
+    "  %[v_acc_0], acc[6:7], v[70:71], %[v_acc_0] \n"
+    "  buffer_load_dword   %[v_os_a0], s[16:19], 0 offen lds     \n"
+    "  s_add_u32     m0, %[s_size_per_issue], m0                  \n" _UK_MFMA_
+    "  %[v_acc_0], acc[8:9], v[72:73], %[v_acc_0] \n" _UK_MFMA_
+    "  %[v_acc_0], acc[10:11], v[74:75], %[v_acc_0] \n"
+    "  buffer_load_dwordx4  acc[132:135], %[v_os_b0], s[20:23], 0 offen offset:1024 \n" _UK_MFMA_
+    "  %[v_acc_0], acc[12:13], v[76:77], %[v_acc_0] \n" _UK_MFMA_
+    "  %[v_acc_0], acc[14:15], v[78:79], %[v_acc_0] \n"
+    "  buffer_load_dword   %[v_os_a1], s[16:19], 0 offen lds     \n"
+    "  s_add_u32     m0, %[s_size_per_issue], m0                  \n" _UK_MFMA_
+    "  %[v_acc_1], acc[0:1], v[80:81], %[v_acc_1] \n" _UK_MFMA_
+    "  %[v_acc_1], acc[2:3], v[82:83], %[v_acc_1] \n"
+    "  buffer_load_dwordx4  acc[136:139], %[v_os_b0], s[20:23], 0 offen offset:2048 \n" _UK_MFMA_
+    "  %[v_acc_1], acc[4:5], v[84:85], %[v_acc_1] \n" _UK_MFMA_
+    "  %[v_acc_1], acc[6:7], v[86:87], %[v_acc_1] \n"
+    "  buffer_load_dword   %[v_os_a2], s[16:19], 0 offen lds     \n"
+    "  s_add_u32     m0, %[s_size_per_issue], m0                  \n" _UK_MFMA_
+    "  %[v_acc_1], acc[8:9], v[88:89], %[v_acc_1] \n" _UK_MFMA_
+    "  %[v_acc_1], acc[10:11], v[90:91], %[v_acc_1] \n"
+    "  buffer_load_dwordx4  acc[140:143], %[v_os_b0], s[20:23], 0 offen offset:3072 \n" _UK_MFMA_
+    "  %[v_acc_1], acc[12:13], v[92:93], %[v_acc_1] \n" _UK_MFMA_
+    "  %[v_acc_1], acc[14:15], v[94:95], %[v_acc_1] \n"
+    "  buffer_load_dword   %[v_os_a3], s[16:19], 0 offen lds     \n"
+    "  s_add_u32     m0, %[s_size_per_issue], m0                  \n" _UK_MFMA_
+    "  %[v_acc_2], acc[16:17], v[64:65], %[v_acc_2] \n" _UK_MFMA_
+    "  %[v_acc_2], acc[18:19], v[66:67], %[v_acc_2] \n"
+    "  buffer_load_dwordx4  acc[144:147], %[v_os_b1], s[20:23], 0 offen \n" _UK_MFMA_
+    "  %[v_acc_2], acc[20:21], v[68:69], %[v_acc_2] \n" _UK_MFMA_
+    "  %[v_acc_2], acc[22:23], v[70:71], %[v_acc_2] \n"
+    "  buffer_load_dword   %[v_os_a4], s[16:19], 0 offen lds     \n"
+    "  s_add_u32     m0, %[s_size_per_issue], m0                  \n" _UK_MFMA_
+    "  %[v_acc_2], acc[24:25], v[72:73], %[v_acc_2] \n" _UK_MFMA_
+    "  %[v_acc_2], acc[26:27], v[74:75], %[v_acc_2] \n"
+    "  buffer_load_dwordx4  acc[148:151], %[v_os_b1], s[20:23], 0 offen offset:1024 \n" _UK_MFMA_
+    "  %[v_acc_2], acc[28:29], v[76:77], %[v_acc_2] \n" _UK_MFMA_
+    "  %[v_acc_2], acc[30:31], v[78:79], %[v_acc_2] \n"
+    "  buffer_load_dword   %[v_os_a5], s[16:19], 0 offen lds     \n"
+    "  s_add_u32     m0, %[s_size_per_issue], m0                  \n" _UK_MFMA_
+    "  %[v_acc_3], acc[16:17], v[80:81], %[v_acc_3] \n" _UK_MFMA_
+    "  %[v_acc_3], acc[18:19], v[82:83], %[v_acc_3] \n"
+    "  buffer_load_dwordx4  acc[152:155], %[v_os_b1], s[20:23], 0 offen offset:2048 \n" _UK_MFMA_
+    "  %[v_acc_3], acc[20:21], v[84:85], %[v_acc_3] \n" _UK_MFMA_
+    "  %[v_acc_3], acc[22:23], v[86:87], %[v_acc_3] \n"
+    "  buffer_load_dword   %[v_os_a6], s[16:19], 0 offen lds     \n"
+    "  s_add_u32     m0, %[s_size_per_issue], m0                  \n" _UK_MFMA_
+    "  %[v_acc_3], acc[24:25], v[88:89], %[v_acc_3] \n" _UK_MFMA_
+    "  %[v_acc_3], acc[26:27], v[90:91], %[v_acc_3] \n"
+    "  buffer_load_dwordx4  acc[156:159], %[v_os_b1], s[20:23], 0 offen offset:3072 \n" _UK_MFMA_
+    "  %[v_acc_3], acc[28:29], v[92:93], %[v_acc_3] \n" _UK_MFMA_
+    "  %[v_acc_3], acc[30:31], v[94:95], %[v_acc_3] \n"
+    "  buffer_load_dword   %[v_os_a7], s[16:19], 0 offen lds     \n"
+    "  s_add_u32     m0, %[smem_sz], %[s_m0_init]                  \n"
+    "  s_waitcnt     vmcnt(32)                               \n" _UK_MFMA_
+    "  %[v_acc_4], acc[32:33], v[64:65], %[v_acc_4] \n" _UK_MFMA_
+    "  %[v_acc_4], acc[34:35], v[66:67], %[v_acc_4] \n"
+    "  buffer_load_dwordx4  acc[160:163], %[v_os_b2], s[20:23], 0 offen \n" _UK_MFMA_
+    "  %[v_acc_4], acc[36:37], v[68:69], %[v_acc_4] \n" _UK_MFMA_
+    "  %[v_acc_4], acc[38:39], v[70:71], %[v_acc_4] \n"
+    "  ds_read_b128  v[96:99], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_0]                "
+    "\n" _UK_MFMA_ "  %[v_acc_4], acc[40:41], v[72:73], %[v_acc_4] \n" _UK_MFMA_
+    "  %[v_acc_4], acc[42:43], v[74:75], %[v_acc_4] \n"
+    "  buffer_load_dwordx4  acc[164:167], %[v_os_b2], s[20:23], 0 offen offset:1024 \n" _UK_MFMA_
+    "  %[v_acc_4], acc[44:45], v[76:77], %[v_acc_4] \n" _UK_MFMA_
+    "  %[v_acc_4], acc[46:47], v[78:79], %[v_acc_4] \n"
+    "  ds_read_b128  v[100:103], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_1]              "
+    "\n" _UK_MFMA_ "  %[v_acc_5], acc[32:33], v[80:81], %[v_acc_5] \n" _UK_MFMA_
+    "  %[v_acc_5], acc[34:35], v[82:83], %[v_acc_5] \n"
+    "  buffer_load_dwordx4  acc[168:171], %[v_os_b2], s[20:23], 0 offen offset:2048 \n" _UK_MFMA_
+    "  %[v_acc_5], acc[36:37], v[84:85], %[v_acc_5] \n" _UK_MFMA_
+    "  %[v_acc_5], acc[38:39], v[86:87], %[v_acc_5] \n"
+    "  ds_read_b128  v[104:107], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_2]              "
+    "\n" _UK_MFMA_ "  %[v_acc_5], acc[40:41], v[88:89], %[v_acc_5] \n" _UK_MFMA_
+    "  %[v_acc_5], acc[42:43], v[90:91], %[v_acc_5] \n"
+    "  buffer_load_dwordx4  acc[172:175], %[v_os_b2], s[20:23], 0 offen offset:3072 \n" _UK_MFMA_
+    "  %[v_acc_5], acc[44:45], v[92:93], %[v_acc_5] \n" _UK_MFMA_
+    "  %[v_acc_5], acc[46:47], v[94:95], %[v_acc_5] \n"
+    "  ds_read_b128  v[108:111], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_3]              "
+    "\n" _UK_MFMA_ "  %[v_acc_6], acc[48:49], v[64:65], %[v_acc_6] \n" _UK_MFMA_
+    "  %[v_acc_6], acc[50:51], v[66:67], %[v_acc_6] \n"
+    "  buffer_load_dwordx4  acc[176:179], %[v_os_b3], s[20:23], 0 offen \n" _UK_MFMA_
+    "  %[v_acc_6], acc[52:53], v[68:69], %[v_acc_6] \n" _UK_MFMA_
+    "  %[v_acc_6], acc[54:55], v[70:71], %[v_acc_6] \n"
+    "  ds_read_b128  v[112:115], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_4]              "
+    "\n" _UK_MFMA_ "  %[v_acc_6], acc[56:57], v[72:73], %[v_acc_6] \n" _UK_MFMA_
+    "  %[v_acc_6], acc[58:59], v[74:75], %[v_acc_6] \n"
+    "  buffer_load_dwordx4  acc[180:183], %[v_os_b3], s[20:23], 0 offen offset:1024 \n" _UK_MFMA_
+    "  %[v_acc_6], acc[60:61], v[76:77], %[v_acc_6] \n" _UK_MFMA_
+    "  %[v_acc_6], acc[62:63], v[78:79], %[v_acc_6] \n"
+    "  ds_read_b128  v[116:119], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_5]              "
+    "\n" _UK_MFMA_ "  %[v_acc_7], acc[48:49], v[80:81], %[v_acc_7] \n" _UK_MFMA_
+    "  %[v_acc_7], acc[50:51], v[82:83], %[v_acc_7] \n"
+    "  buffer_load_dwordx4  acc[184:187], %[v_os_b3], s[20:23], 0 offen offset:2048 \n" _UK_MFMA_
+    "  %[v_acc_7], acc[52:53], v[84:85], %[v_acc_7] \n" _UK_MFMA_
+    "  %[v_acc_7], acc[54:55], v[86:87], %[v_acc_7] \n"
+    "  ds_read_b128  v[120:123], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_6]              "
+    "\n" _UK_MFMA_ "  %[v_acc_7], acc[56:57], v[88:89], %[v_acc_7] \n" _UK_MFMA_
+    "  %[v_acc_7], acc[58:59], v[90:91], %[v_acc_7] \n"
+    "  buffer_load_dwordx4  acc[188:191], %[v_os_b3], s[20:23], 0 offen offset:3072 \n" _UK_MFMA_
+    "  %[v_acc_7], acc[60:61], v[92:93], %[v_acc_7] \n" _UK_MFMA_
+    "  %[v_acc_7], acc[62:63], v[94:95], %[v_acc_7] \n"
+    "  ds_read_b128  v[124:127], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_7]              \n"
+    "  s_waitcnt     vmcnt(32)                               \n" _UK_MFMA_
+    "  %[v_acc_8], acc[64:65], v[64:65], %[v_acc_8] \n" _UK_MFMA_
+    "  %[v_acc_8], acc[66:67], v[66:67], %[v_acc_8] \n"
+    "  buffer_load_dwordx4  acc[192:195], %[v_os_b4], s[20:23], 0 offen \n" _UK_MFMA_
+    "  %[v_acc_8], acc[68:69], v[68:69], %[v_acc_8] \n" _UK_MFMA_
+    "  %[v_acc_8], acc[70:71], v[70:71], %[v_acc_8] \n" _UK_MFMA_
+    "  %[v_acc_8], acc[72:73], v[72:73], %[v_acc_8] \n" _UK_MFMA_
+    "  %[v_acc_8], acc[74:75], v[74:75], %[v_acc_8] \n"
+    "  buffer_load_dwordx4  acc[196:199], %[v_os_b4], s[20:23], 0 offen offset:1024 \n" _UK_MFMA_
+    "  %[v_acc_8], acc[76:77], v[76:77], %[v_acc_8] \n" _UK_MFMA_
+    "  %[v_acc_8], acc[78:79], v[78:79], %[v_acc_8] \n" _UK_MFMA_
+    "  %[v_acc_9], acc[64:65], v[80:81], %[v_acc_9] \n" _UK_MFMA_
+    "  %[v_acc_9], acc[66:67], v[82:83], %[v_acc_9] \n"
+    "  buffer_load_dwordx4  acc[200:203], %[v_os_b4], s[20:23], 0 offen offset:2048 \n" _UK_MFMA_
+    "  %[v_acc_9], acc[68:69], v[84:85], %[v_acc_9] \n" _UK_MFMA_
+    "  %[v_acc_9], acc[70:71], v[86:87], %[v_acc_9] \n" _UK_MFMA_
+    "  %[v_acc_9], acc[72:73], v[88:89], %[v_acc_9] \n" _UK_MFMA_
+    "  %[v_acc_9], acc[74:75], v[90:91], %[v_acc_9] \n"
+    "  buffer_load_dwordx4  acc[204:207], %[v_os_b4], s[20:23], 0 offen offset:3072 \n" _UK_MFMA_
+    "  %[v_acc_9], acc[76:77], v[92:93], %[v_acc_9] \n" _UK_MFMA_
+    "  %[v_acc_9], acc[78:79], v[94:95], %[v_acc_9] \n" _UK_MFMA_
+    "  %[v_acc_10], acc[80:81], v[64:65], %[v_acc_10] \n" _UK_MFMA_
+    "  %[v_acc_10], acc[82:83], v[66:67], %[v_acc_10] \n"
+    "  buffer_load_dwordx4  acc[208:211], %[v_os_b5], s[20:23], 0 offen \n" _UK_MFMA_
+    "  %[v_acc_10], acc[84:85], v[68:69], %[v_acc_10] \n" _UK_MFMA_
+    "  %[v_acc_10], acc[86:87], v[70:71], %[v_acc_10] \n" _UK_MFMA_
+    "  %[v_acc_10], acc[88:89], v[72:73], %[v_acc_10] \n" _UK_MFMA_
+    "  %[v_acc_10], acc[90:91], v[74:75], %[v_acc_10] \n"
+    "  buffer_load_dwordx4  acc[212:215], %[v_os_b5], s[20:23], 0 offen offset:1024 \n" _UK_MFMA_
+    "  %[v_acc_10], acc[92:93], v[76:77], %[v_acc_10] \n" _UK_MFMA_
+    "  %[v_acc_10], acc[94:95], v[78:79], %[v_acc_10] \n" _UK_MFMA_
+    "  %[v_acc_11], acc[80:81], v[80:81], %[v_acc_11] \n" _UK_MFMA_
+    "  %[v_acc_11], acc[82:83], v[82:83], %[v_acc_11] \n"
+    "  buffer_load_dwordx4  acc[216:219], %[v_os_b5], s[20:23], 0 offen offset:2048 \n" _UK_MFMA_
+    "  %[v_acc_11], acc[84:85], v[84:85], %[v_acc_11] \n" _UK_MFMA_
+    "  %[v_acc_11], acc[86:87], v[86:87], %[v_acc_11] \n" _UK_MFMA_
+    "  %[v_acc_11], acc[88:89], v[88:89], %[v_acc_11] \n" _UK_MFMA_
+    "  %[v_acc_11], acc[90:91], v[90:91], %[v_acc_11] \n"
+    "  buffer_load_dwordx4  acc[220:223], %[v_os_b5], s[20:23], 0 offen offset:3072 \n" _UK_MFMA_
+    "  %[v_acc_11], acc[92:93], v[92:93], %[v_acc_11] \n" _UK_MFMA_
+    "  %[v_acc_11], acc[94:95], v[94:95], %[v_acc_11] \n"
+    "  s_waitcnt     vmcnt(32)                               \n" _UK_MFMA_
+    "  %[v_acc_12], acc[96:97], v[64:65], %[v_acc_12] \n" _UK_MFMA_
+    "  %[v_acc_12], acc[98:99], v[66:67], %[v_acc_12] \n"
+    "  buffer_load_dwordx4  acc[224:227], %[v_os_b6], s[20:23], 0 offen \n" _UK_MFMA_
+    "  %[v_acc_12], acc[100:101], v[68:69], %[v_acc_12] \n" _UK_MFMA_
+    "  %[v_acc_12], acc[102:103], v[70:71], %[v_acc_12] \n" _UK_MFMA_
+    "  %[v_acc_12], acc[104:105], v[72:73], %[v_acc_12] \n" _UK_MFMA_
+    "  %[v_acc_12], acc[106:107], v[74:75], %[v_acc_12] \n"
+    "  buffer_load_dwordx4  acc[228:231], %[v_os_b6], s[20:23], 0 offen offset:1024 \n" _UK_MFMA_
+    "  %[v_acc_12], acc[108:109], v[76:77], %[v_acc_12] \n" _UK_MFMA_
+    "  %[v_acc_12], acc[110:111], v[78:79], %[v_acc_12] \n" _UK_MFMA_
+    "  %[v_acc_13], acc[96:97], v[80:81], %[v_acc_13] \n" _UK_MFMA_
+    "  %[v_acc_13], acc[98:99], v[82:83], %[v_acc_13] \n"
+    "  buffer_load_dwordx4  acc[232:235], %[v_os_b6], s[20:23], 0 offen offset:2048 \n" _UK_MFMA_
+    "  %[v_acc_13], acc[100:101], v[84:85], %[v_acc_13] \n" _UK_MFMA_
+    "  %[v_acc_13], acc[102:103], v[86:87], %[v_acc_13] \n" _UK_MFMA_
+    "  %[v_acc_13], acc[104:105], v[88:89], %[v_acc_13] \n" _UK_MFMA_
+    "  %[v_acc_13], acc[106:107], v[90:91], %[v_acc_13] \n"
+    "  buffer_load_dwordx4  acc[236:239], %[v_os_b6], s[20:23], 0 offen offset:3072 \n" _UK_MFMA_
+    "  %[v_acc_13], acc[108:109], v[92:93], %[v_acc_13] \n" _UK_MFMA_
+    "  %[v_acc_13], acc[110:111], v[94:95], %[v_acc_13] \n" _UK_MFMA_
+    "  %[v_acc_14], acc[112:113], v[64:65], %[v_acc_14] \n" _UK_MFMA_
+    "  %[v_acc_14], acc[114:115], v[66:67], %[v_acc_14] \n"
+    "  buffer_load_dwordx4  acc[240:243], %[v_os_b7], s[20:23], 0 offen \n" _UK_MFMA_
+    "  %[v_acc_14], acc[116:117], v[68:69], %[v_acc_14] \n" _UK_MFMA_
+    "  %[v_acc_14], acc[118:119], v[70:71], %[v_acc_14] \n" _UK_MFMA_
+    "  %[v_acc_14], acc[120:121], v[72:73], %[v_acc_14] \n" _UK_MFMA_
+    "  %[v_acc_14], acc[122:123], v[74:75], %[v_acc_14] \n"
+    "  buffer_load_dwordx4  acc[244:247], %[v_os_b7], s[20:23], 0 offen offset:1024 \n" _UK_MFMA_
+    "  %[v_acc_14], acc[124:125], v[76:77], %[v_acc_14] \n" _UK_MFMA_
+    "  %[v_acc_14], acc[126:127], v[78:79], %[v_acc_14] \n" _UK_MFMA_
+    "  %[v_acc_15], acc[112:113], v[80:81], %[v_acc_15] \n" _UK_MFMA_
+    "  %[v_acc_15], acc[114:115], v[82:83], %[v_acc_15] \n"
+    "  buffer_load_dwordx4  acc[248:251], %[v_os_b7], s[20:23], 0 offen offset:2048 \n" _UK_MFMA_
+    "  %[v_acc_15], acc[116:117], v[84:85], %[v_acc_15] \n" _UK_MFMA_
+    "  %[v_acc_15], acc[118:119], v[86:87], %[v_acc_15] \n" _UK_MFMA_
+    "  %[v_acc_15], acc[120:121], v[88:89], %[v_acc_15] \n" _UK_MFMA_
+    "  %[v_acc_15], acc[122:123], v[90:91], %[v_acc_15] \n"
+    "  buffer_load_dwordx4  acc[252:255], %[v_os_b7], s[20:23], 0 offen offset:3072\n" _UK_MFMA_
+    "  %[v_acc_15], acc[124:125], v[92:93], %[v_acc_15] \n" _UK_MFMA_
+    "  %[v_acc_15], acc[126:127], v[94:95], %[v_acc_15] \n"
+    "  s_sub_i32     %[s_loop_cnt], %[s_loop_cnt], 1                \n"
+    "  s_cmp_gt_i32  %[s_loop_cnt] 0                                \n"
+    "  s_cbranch_scc0 L_end%=                                       \n"
+    "  s_cmp_gt_i32  %[s_loop_cnt] 2             ; move a with cond \n"
+    "  s_cselect_b32 s86, %[s_tile_os_a], 0                          \n"
+    "  s_add_u32     s16, s86, s16                                  \n"
+    "  s_addc_u32    s17, 0, s17                                    \n"
+    "  s_cmp_gt_i32  %[s_loop_cnt] 1             ; move b with cond \n"
+    "  s_cselect_b32 s86, %[s_tile_os_b], 0                          \n"
+    "  s_add_u32     s20, s86, s20                                  \n"
+    "  s_addc_u32    s21, 0, s21                                    \n"
+    "  ;------------------------------------------                  \n"
+    "  s_waitcnt     vmcnt(24) & lgkmcnt(0)                  \n"
+    "  s_barrier                                             \n" _UK_MFMA_
+    "  %[v_acc_0], acc[128:129], v[96:97], %[v_acc_0] \n" _UK_MFMA_
+    "  %[v_acc_0], acc[130:131], v[98:99], %[v_acc_0] \n"
+    "  buffer_load_dwordx4  acc[0:3], %[v_os_b0], s[20:23], 0 offen \n" _UK_MFMA_
+    "  %[v_acc_0], acc[132:133], v[100:101], %[v_acc_0] \n" _UK_MFMA_
+    "  %[v_acc_0], acc[134:135], v[102:103], %[v_acc_0] \n"
+    "  buffer_load_dword   %[v_os_a0], s[16:19], 0 offen lds     \n"
+    "  s_add_u32     m0, %[s_size_per_issue], m0                  \n" _UK_MFMA_
+    "  %[v_acc_0], acc[136:137], v[104:105], %[v_acc_0] \n" _UK_MFMA_
+    "  %[v_acc_0], acc[138:139], v[106:107], %[v_acc_0] \n"
+    "  buffer_load_dwordx4  acc[4:7], %[v_os_b0], s[20:23], 0 offen offset:1024 \n" _UK_MFMA_
+    "  %[v_acc_0], acc[140:141], v[108:109], %[v_acc_0] \n" _UK_MFMA_
+    "  %[v_acc_0], acc[142:143], v[110:111], %[v_acc_0] \n"
+    "  buffer_load_dword   %[v_os_a1], s[16:19], 0 offen lds     \n"
+    "  s_add_u32     m0, %[s_size_per_issue], m0                  \n" _UK_MFMA_
+    "  %[v_acc_1], acc[128:129], v[112:113], %[v_acc_1] \n" _UK_MFMA_
+    "  %[v_acc_1], acc[130:131], v[114:115], %[v_acc_1] \n"
+    "  buffer_load_dwordx4  acc[8:11], %[v_os_b0], s[20:23], 0 offen offset:2048 \n" _UK_MFMA_
+    "  %[v_acc_1], acc[132:133], v[116:117], %[v_acc_1] \n" _UK_MFMA_
+    "  %[v_acc_1], acc[134:135], v[118:119], %[v_acc_1] \n"
+    "  buffer_load_dword   %[v_os_a2], s[16:19], 0 offen lds     \n"
+    "  s_add_u32     m0, %[s_size_per_issue], m0                  \n" _UK_MFMA_
+    "  %[v_acc_1], acc[136:137], v[120:121], %[v_acc_1] \n" _UK_MFMA_
+    "  %[v_acc_1], acc[138:139], v[122:123], %[v_acc_1] \n"
+    "  buffer_load_dwordx4  acc[12:15], %[v_os_b0], s[20:23], 0 offen offset:3072 \n" _UK_MFMA_
+    "  %[v_acc_1], acc[140:141], v[124:125], %[v_acc_1] \n" _UK_MFMA_
+    "  %[v_acc_1], acc[142:143], v[126:127], %[v_acc_1] \n"
+    "  buffer_load_dword   %[v_os_a3], s[16:19], 0 offen lds     \n"
+    "  s_add_u32     m0, %[s_size_per_issue], m0                  \n" _UK_MFMA_
+    "  %[v_acc_2], acc[144:145], v[96:97], %[v_acc_2] \n" _UK_MFMA_
+    "  %[v_acc_2], acc[146:147], v[98:99], %[v_acc_2] \n"
+    "  buffer_load_dwordx4  acc[16:19], %[v_os_b1], s[20:23], 0 offen \n" _UK_MFMA_
+    "  %[v_acc_2], acc[148:149], v[100:101], %[v_acc_2] \n" _UK_MFMA_
+    "  %[v_acc_2], acc[150:151], v[102:103], %[v_acc_2] \n"
+    "  buffer_load_dword   %[v_os_a4], s[16:19], 0 offen lds     \n"
+    "  s_add_u32     m0, %[s_size_per_issue], m0                  \n" _UK_MFMA_
+    "  %[v_acc_2], acc[152:153], v[104:105], %[v_acc_2] \n" _UK_MFMA_
+    "  %[v_acc_2], acc[154:155], v[106:107], %[v_acc_2] \n"
+    "  buffer_load_dwordx4  acc[20:23], %[v_os_b1], s[20:23], 0 offen offset:1024 \n" _UK_MFMA_
+    "  %[v_acc_2], acc[156:157], v[108:109], %[v_acc_2] \n" _UK_MFMA_
+    "  %[v_acc_2], acc[158:159], v[110:111], %[v_acc_2] \n"
+    "  buffer_load_dword   %[v_os_a5], s[16:19], 0 offen lds     \n"
+    "  s_add_u32     m0, %[s_size_per_issue], m0                  \n" _UK_MFMA_
+    "  %[v_acc_3], acc[144:145], v[112:113], %[v_acc_3] \n" _UK_MFMA_
+    "  %[v_acc_3], acc[146:147], v[114:115], %[v_acc_3] \n"
+    "  buffer_load_dwordx4  acc[24:27], %[v_os_b1], s[20:23], 0 offen offset:2048 \n" _UK_MFMA_
+    "  %[v_acc_3], acc[148:149], v[116:117], %[v_acc_3] \n" _UK_MFMA_
+    "  %[v_acc_3], acc[150:151], v[118:119], %[v_acc_3] \n"
+    "  buffer_load_dword   %[v_os_a6], s[16:19], 0 offen lds     \n"
+    "  s_add_u32     m0, %[s_size_per_issue], m0                  \n" _UK_MFMA_
+    "  %[v_acc_3], acc[152:153], v[120:121], %[v_acc_3] \n" _UK_MFMA_
+    "  %[v_acc_3], acc[154:155], v[122:123], %[v_acc_3] \n"
+    "  buffer_load_dwordx4  acc[28:31], %[v_os_b1], s[20:23], 0 offen offset:3072 \n" _UK_MFMA_
+    "  %[v_acc_3], acc[156:157], v[124:125], %[v_acc_3] \n" _UK_MFMA_
+    "  %[v_acc_3], acc[158:159], v[126:127], %[v_acc_3] \n"
+    "  buffer_load_dword   %[v_os_a7], s[16:19], 0 offen lds     \n"
+    "  s_add_u32     m0, 0, %[s_m0_init]                  \n"
+    "  s_waitcnt     vmcnt(32)                               \n" _UK_MFMA_
+    "  %[v_acc_4], acc[160:161], v[96:97], %[v_acc_4] \n" _UK_MFMA_
+    "  %[v_acc_4], acc[162:163], v[98:99], %[v_acc_4] \n"
+    "  buffer_load_dwordx4  acc[32:35], %[v_os_b2], s[20:23], 0 offen \n" _UK_MFMA_
+    "  %[v_acc_4], acc[164:165], v[100:101], %[v_acc_4] \n" _UK_MFMA_
+    "  %[v_acc_4], acc[166:167], v[102:103], %[v_acc_4] \n"
+    "  ds_read_b128  v[64:67], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_0]  \n" _UK_MFMA_
+    "  %[v_acc_4], acc[168:169], v[104:105], %[v_acc_4] \n" _UK_MFMA_
+    "  %[v_acc_4], acc[170:171], v[106:107], %[v_acc_4] \n"
+    "  buffer_load_dwordx4  acc[36:39], %[v_os_b2], s[20:23], 0 offen offset:1024 \n" _UK_MFMA_
+    "  %[v_acc_4], acc[172:173], v[108:109], %[v_acc_4] \n" _UK_MFMA_
+    "  %[v_acc_4], acc[174:175], v[110:111], %[v_acc_4] \n"
+    "  ds_read_b128  v[68:71], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_1]  \n" _UK_MFMA_
+    "  %[v_acc_5], acc[160:161], v[112:113], %[v_acc_5] \n" _UK_MFMA_
+    "  %[v_acc_5], acc[162:163], v[114:115], %[v_acc_5] \n"
+    "  buffer_load_dwordx4  acc[40:43], %[v_os_b2], s[20:23], 0 offen offset:2048 \n" _UK_MFMA_
+    "  %[v_acc_5], acc[164:165], v[116:117], %[v_acc_5] \n" _UK_MFMA_
+    "  %[v_acc_5], acc[166:167], v[118:119], %[v_acc_5] \n"
+    "  ds_read_b128  v[72:75], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_2]                 "
+    "\n" _UK_MFMA_ "  %[v_acc_5], acc[168:169], v[120:121], %[v_acc_5] \n" _UK_MFMA_
+    "  %[v_acc_5], acc[170:171], v[122:123], %[v_acc_5] \n"
+    "  buffer_load_dwordx4  acc[44:47], %[v_os_b2], s[20:23], 0 offen offset:3072 \n" _UK_MFMA_
+    "  %[v_acc_5], acc[172:173], v[124:125], %[v_acc_5] \n" _UK_MFMA_
+    "  %[v_acc_5], acc[174:175], v[126:127], %[v_acc_5] \n"
+    "  ds_read_b128  v[76:79], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_3]                "
+    "\n" _UK_MFMA_ "  %[v_acc_6], acc[176:177], v[96:97], %[v_acc_6] \n" _UK_MFMA_
+    "  %[v_acc_6], acc[178:179], v[98:99], %[v_acc_6] \n"
+    "  buffer_load_dwordx4  acc[48:51], %[v_os_b3], s[20:23], 0 offen \n" _UK_MFMA_
+    "  %[v_acc_6], acc[180:181], v[100:101], %[v_acc_6] \n" _UK_MFMA_
+    "  %[v_acc_6], acc[182:183], v[102:103], %[v_acc_6] \n"
+    "  ds_read_b128  v[80:83], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_4]               "
+    "\n" _UK_MFMA_ "  %[v_acc_6], acc[184:185], v[104:105], %[v_acc_6] \n" _UK_MFMA_
+    "  %[v_acc_6], acc[186:187], v[106:107], %[v_acc_6] \n"
+    "  buffer_load_dwordx4  acc[52:55], %[v_os_b3], s[20:23], 0 offen offset:1024 \n" _UK_MFMA_
+    "  %[v_acc_6], acc[188:189], v[108:109], %[v_acc_6] \n" _UK_MFMA_
+    "  %[v_acc_6], acc[190:191], v[110:111], %[v_acc_6] \n"
+    "  ds_read_b128  v[84:87], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_5]            "
+    "\n" _UK_MFMA_ "  %[v_acc_7], acc[176:177], v[112:113], %[v_acc_7] \n" _UK_MFMA_
+    "  %[v_acc_7], acc[178:179], v[114:115], %[v_acc_7] \n"
+    "  buffer_load_dwordx4  acc[56:59], %[v_os_b3], s[20:23], 0 offen offset:2048 \n" _UK_MFMA_
+    "  %[v_acc_7], acc[180:181], v[116:117], %[v_acc_7] \n" _UK_MFMA_
+    "  %[v_acc_7], acc[182:183], v[118:119], %[v_acc_7] \n"
+    "  ds_read_b128  v[88:91], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_6]                "
+    "\n" _UK_MFMA_ "  %[v_acc_7], acc[184:185], v[120:121], %[v_acc_7] \n" _UK_MFMA_
+    "  %[v_acc_7], acc[186:187], v[122:123], %[v_acc_7] \n"
+    "  buffer_load_dwordx4  acc[60:63], %[v_os_b3], s[20:23], 0 offen offset:3072 \n" _UK_MFMA_
+    "  %[v_acc_7], acc[188:189], v[124:125], %[v_acc_7] \n" _UK_MFMA_
+    "  %[v_acc_7], acc[190:191], v[126:127], %[v_acc_7] \n"
+    "  ds_read_b128  v[92:95], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_7]           \n"
+    "  s_waitcnt     vmcnt(32)                               \n" _UK_MFMA_
+    "  %[v_acc_8], acc[192:193], v[96:97], %[v_acc_8] \n" _UK_MFMA_
+    "  %[v_acc_8], acc[194:195], v[98:99], %[v_acc_8] \n"
+    "  buffer_load_dwordx4  acc[64:67], %[v_os_b4], s[20:23], 0 offen \n" _UK_MFMA_
+    "  %[v_acc_8], acc[196:197], v[100:101], %[v_acc_8] \n" _UK_MFMA_
+    "  %[v_acc_8], acc[198:199], v[102:103], %[v_acc_8] \n" _UK_MFMA_
+    "  %[v_acc_8], acc[200:201], v[104:105], %[v_acc_8] \n" _UK_MFMA_
+    "  %[v_acc_8], acc[202:203], v[106:107], %[v_acc_8] \n"
+    "  buffer_load_dwordx4  acc[68:71], %[v_os_b4], s[20:23], 0 offen offset:1024 \n" _UK_MFMA_
+    "  %[v_acc_8], acc[204:205], v[108:109], %[v_acc_8] \n" _UK_MFMA_
+    "  %[v_acc_8], acc[206:207], v[110:111], %[v_acc_8] \n" _UK_MFMA_
+    "  %[v_acc_9], acc[192:193], v[112:113], %[v_acc_9] \n" _UK_MFMA_
+    "  %[v_acc_9], acc[194:195], v[114:115], %[v_acc_9] \n"
+    "  buffer_load_dwordx4  acc[72:75], %[v_os_b4], s[20:23], 0 offen offset:2048 \n" _UK_MFMA_
+    "  %[v_acc_9], acc[196:197], v[116:117], %[v_acc_9] \n" _UK_MFMA_
+    "  %[v_acc_9], acc[198:199], v[118:119], %[v_acc_9] \n" _UK_MFMA_
+    "  %[v_acc_9], acc[200:201], v[120:121], %[v_acc_9] \n" _UK_MFMA_
+    "  %[v_acc_9], acc[202:203], v[122:123], %[v_acc_9] \n"
+    "  buffer_load_dwordx4  acc[76:79], %[v_os_b4], s[20:23], 0 offen offset:3072 \n" _UK_MFMA_
+    "  %[v_acc_9], acc[204:205], v[124:125], %[v_acc_9] \n" _UK_MFMA_
+    "  %[v_acc_9], acc[206:207], v[126:127], %[v_acc_9] \n" _UK_MFMA_
+    "  %[v_acc_10], acc[208:209], v[96:97], %[v_acc_10] \n" _UK_MFMA_
+    "  %[v_acc_10], acc[210:211], v[98:99], %[v_acc_10] \n"
+    "  buffer_load_dwordx4  acc[80:83], %[v_os_b5], s[20:23], 0 offen \n" _UK_MFMA_
+    "  %[v_acc_10], acc[212:213], v[100:101], %[v_acc_10] \n" _UK_MFMA_
+    "  %[v_acc_10], acc[214:215], v[102:103], %[v_acc_10] \n" _UK_MFMA_
+    "  %[v_acc_10], acc[216:217], v[104:105], %[v_acc_10] \n" _UK_MFMA_
+    "  %[v_acc_10], acc[218:219], v[106:107], %[v_acc_10] \n"
+    "  buffer_load_dwordx4  acc[84:87], %[v_os_b5], s[20:23], 0 offen offset:1024 \n" _UK_MFMA_
+    "  %[v_acc_10], acc[220:221], v[108:109], %[v_acc_10] \n" _UK_MFMA_
+    "  %[v_acc_10], acc[222:223], v[110:111], %[v_acc_10] \n" _UK_MFMA_
+    "  %[v_acc_11], acc[208:209], v[112:113], %[v_acc_11] \n" _UK_MFMA_
+    "  %[v_acc_11], acc[210:211], v[114:115], %[v_acc_11] \n"
+    "  buffer_load_dwordx4  acc[88:91], %[v_os_b5], s[20:23], 0 offen offset:2048 \n" _UK_MFMA_
+    "  %[v_acc_11], acc[212:213], v[116:117], %[v_acc_11] \n" _UK_MFMA_
+    "  %[v_acc_11], acc[214:215], v[118:119], %[v_acc_11] \n" _UK_MFMA_
+    "  %[v_acc_11], acc[216:217], v[120:121], %[v_acc_11] \n" _UK_MFMA_
+    "  %[v_acc_11], acc[218:219], v[122:123], %[v_acc_11] \n"
+    "  buffer_load_dwordx4  acc[92:95], %[v_os_b5], s[20:23], 0 offen offset:3072 \n" _UK_MFMA_
+    "  %[v_acc_11], acc[220:221], v[124:125], %[v_acc_11] \n" _UK_MFMA_
+    "  %[v_acc_11], acc[222:223], v[126:127], %[v_acc_11] \n"
+    "  s_waitcnt     vmcnt(32)                               \n" _UK_MFMA_
+    "  %[v_acc_12], acc[224:225], v[96:97], %[v_acc_12] \n" _UK_MFMA_
+    "  %[v_acc_12], acc[226:227], v[98:99], %[v_acc_12] \n"
+    "  buffer_load_dwordx4  acc[96:99], %[v_os_b6], s[20:23], 0 offen \n" _UK_MFMA_
+    "  %[v_acc_12], acc[228:229], v[100:101], %[v_acc_12] \n" _UK_MFMA_
+    "  %[v_acc_12], acc[230:231], v[102:103], %[v_acc_12] \n" _UK_MFMA_
+    "  %[v_acc_12], acc[232:233], v[104:105], %[v_acc_12] \n" _UK_MFMA_
+    "  %[v_acc_12], acc[234:235], v[106:107], %[v_acc_12] \n"
+    "  buffer_load_dwordx4  acc[100:103], %[v_os_b6], s[20:23], 0 offen offset:1024 \n" _UK_MFMA_
+    "  %[v_acc_12], acc[236:237], v[108:109], %[v_acc_12] \n" _UK_MFMA_
+    "  %[v_acc_12], acc[238:239], v[110:111], %[v_acc_12] \n" _UK_MFMA_
+    "  %[v_acc_13], acc[224:225], v[112:113], %[v_acc_13] \n" _UK_MFMA_
+    "  %[v_acc_13], acc[226:227], v[114:115], %[v_acc_13] \n"
+    "  buffer_load_dwordx4  acc[104:107], %[v_os_b6], s[20:23], 0 offen offset:2048 \n" _UK_MFMA_
+    "  %[v_acc_13], acc[228:229], v[116:117], %[v_acc_13] \n" _UK_MFMA_
+    "  %[v_acc_13], acc[230:231], v[118:119], %[v_acc_13] \n" _UK_MFMA_
+    "  %[v_acc_13], acc[232:233], v[120:121], %[v_acc_13] \n" _UK_MFMA_
+    "  %[v_acc_13], acc[234:235], v[122:123], %[v_acc_13] \n"
+    "  buffer_load_dwordx4  acc[108:111], %[v_os_b6], s[20:23], 0 offen offset:3072 \n" _UK_MFMA_
+    "  %[v_acc_13], acc[236:237], v[124:125], %[v_acc_13] \n" _UK_MFMA_
+    "  %[v_acc_13], acc[238:239], v[126:127], %[v_acc_13] \n" _UK_MFMA_
+    "  %[v_acc_14], acc[240:241], v[96:97], %[v_acc_14] \n" _UK_MFMA_
+    "  %[v_acc_14], acc[242:243], v[98:99], %[v_acc_14] \n"
+    "  buffer_load_dwordx4  acc[112:115], %[v_os_b7], s[20:23], 0 offen \n" _UK_MFMA_
+    "  %[v_acc_14], acc[244:245], v[100:101], %[v_acc_14] \n" _UK_MFMA_
+    "  %[v_acc_14], acc[246:247], v[102:103], %[v_acc_14] \n" _UK_MFMA_
+    "  %[v_acc_14], acc[248:249], v[104:105], %[v_acc_14] \n" _UK_MFMA_
+    "  %[v_acc_14], acc[250:251], v[106:107], %[v_acc_14] \n"
+    "  buffer_load_dwordx4  acc[116:119], %[v_os_b7], s[20:23], 0 offen offset:1024 \n" _UK_MFMA_
+    "  %[v_acc_14], acc[252:253], v[108:109], %[v_acc_14] \n" _UK_MFMA_
+    "  %[v_acc_14], acc[254:255], v[110:111], %[v_acc_14] \n" _UK_MFMA_
+    "  %[v_acc_15], acc[240:241], v[112:113], %[v_acc_15] \n" _UK_MFMA_
+    "  %[v_acc_15], acc[242:243], v[114:115], %[v_acc_15] \n"
+    "  buffer_load_dwordx4  acc[120:123], %[v_os_b7], s[20:23], 0 offen offset:2048 \n" _UK_MFMA_
+    "  %[v_acc_15], acc[244:245], v[116:117], %[v_acc_15] \n" _UK_MFMA_
+    "  %[v_acc_15], acc[246:247], v[118:119], %[v_acc_15] \n" _UK_MFMA_
+    "  %[v_acc_15], acc[248:249], v[120:121], %[v_acc_15] \n" _UK_MFMA_
+    "  %[v_acc_15], acc[250:251], v[122:123], %[v_acc_15] \n"
+    "  buffer_load_dwordx4  acc[124:127], %[v_os_b7], s[20:23], 0 offen offset:3072 \n" _UK_MFMA_
+    "  %[v_acc_15], acc[252:253], v[124:125], %[v_acc_15] \n" _UK_MFMA_
+    "  %[v_acc_15], acc[254:255], v[126:127], %[v_acc_15] \n"
+    "  s_sub_i32     %[s_loop_cnt], %[s_loop_cnt], 1                \n"
+    "  s_cmp_gt_i32  %[s_loop_cnt] 0                                \n"
+    "  s_cbranch_scc0 L_end%=                                       \n"
+    "  s_cmp_gt_i32  %[s_loop_cnt] 2             ; move a with cond \n"
+    "  s_cselect_b32 s86, %[s_tile_os_a], 0                          \n"
+    "  s_add_u32     s16, s86, s16                                  \n"
+    "  s_addc_u32    s17, 0, s17                                    \n"
+    "  s_cmp_gt_i32  %[s_loop_cnt] 1             ; move b with cond \n"
+    "  s_cselect_b32 s86, %[s_tile_os_b], 0                          \n"
+    "  s_add_u32     s20, s86, s20                                  \n"
+    "  s_addc_u32    s21, 0, s21                                    \n"
+    "  s_branch     L_start%=                                       \n"
+    "L_end%=:                                                       \n"
+    "  s_nop 2                                                      \n"

 #undef _UK_MFMA_
--- a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp
+++ b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp
@@ -810,21 +810,46 @@ struct FusedMoeGemmPipelineFlatmmPolicy
    CK_TILE_HOST_DEVICE static constexpr auto GetUK_1()
    {
        using S_ = typename Problem::BlockShape;
+        using T_ = typename Problem::Traits;
        if constexpr(std::is_same_v<typename Problem::YDataType, ck_tile::bf16_t> &&
                     std::is_same_v<typename Problem::DDataType, ck_tile::bf16_t> &&
                     std::is_same_v<typename Problem::TopkWeightDataType, float> &&
                     S_::Block_M1 == 32 && S_::Block_N1 == 128 && S_::Block_K1 == 512 &&
-                     S_::Warp_M0 == 16 && S_::Warp_N0 == 16 && S_::Warp_K0 == 32)
+                     S_::Warp_M0 == 16 && S_::Warp_N0 == 16 && S_::Warp_K0 == 32 &&
+                     T_::PipeInterleave == false)
        {
            return FlatmmSn_32x128x512_1x4x1_16x16x32_BF16{};
+            // return FlatmmSn_32x128x512_1x4x1_16x16x32_BF16_itl{};
        }
        else if constexpr(std::is_same_v<typename Problem::YDataType, ck_tile::fp16_t> &&
                          std::is_same_v<typename Problem::DDataType, ck_tile::fp16_t> &&
                          std::is_same_v<typename Problem::TopkWeightDataType, float> &&
                          S_::Block_M1 == 32 && S_::Block_N1 == 128 && S_::Block_K1 == 512 &&
-                          S_::Warp_M0 == 16 && S_::Warp_N0 == 16 && S_::Warp_K0 == 32)
+                          S_::Warp_M0 == 16 && S_::Warp_N0 == 16 && S_::Warp_K0 == 32 &&
+                          T_::PipeInterleave == false)
        {
            return FlatmmSn_32x128x512_1x4x1_16x16x32_FP16{};
+            // return FlatmmSn_32x128x512_1x4x1_16x16x32_FP16_itl{};
+        }
+        else if constexpr(std::is_same_v<typename Problem::YDataType, ck_tile::bf16_t> &&
+                          std::is_same_v<typename Problem::DDataType, ck_tile::bf16_t> &&
+                          std::is_same_v<typename Problem::TopkWeightDataType, float> &&
+                          S_::Block_M1 == 32 && S_::Block_N1 == 128 && S_::Block_K1 == 512 &&
+                          S_::Warp_M0 == 16 && S_::Warp_N0 == 16 && S_::Warp_K0 == 32 &&
+                          T_::PipeInterleave == true)
+        {
+            // return FlatmmSn_32x128x512_1x4x1_16x16x32_FP16{};
+            return FlatmmSn_32x128x512_1x4x1_16x16x32_BF16_itl{};
+        }
+        else if constexpr(std::is_same_v<typename Problem::YDataType, ck_tile::fp16_t> &&
+                          std::is_same_v<typename Problem::DDataType, ck_tile::fp16_t> &&
+                          std::is_same_v<typename Problem::TopkWeightDataType, float> &&
+                          S_::Block_M1 == 32 && S_::Block_N1 == 128 && S_::Block_K1 == 512 &&
+                          S_::Warp_M0 == 16 && S_::Warp_N0 == 16 && S_::Warp_K0 == 32 &&
+                          T_::PipeInterleave == true)
+        {
+            // return FlatmmSn_32x128x512_1x4x1_16x16x32_FP16{};
+            return FlatmmSn_32x128x512_1x4x1_16x16x32_FP16_itl{};
        }
    }
 };

--- a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_traits.hpp
+++ b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_traits.hpp
@@ -22,7 +22,8 @@ template <bool IsGateOnly_,
          FusedMoeGemmWeightPermuteEnum PermuteEnum_ =
              FusedMoeGemmWeightPermuteEnum::b_nr_kr_waveflatten,
          bool PadHiddenSize_       = false,
-          bool PadIntermediateSize_ = false>
+          bool PadIntermediateSize_ = false,
+          bool PipeInterleave_      = true>
 struct FusedMoeGemmTraits
 {
    // Gate+Up or Gate only
@@ -32,6 +33,7 @@ struct FusedMoeGemmTraits
    static constexpr FusedMoeGemmWeightPermuteEnum PermuteEnum = PermuteEnum_;
    static constexpr bool PadHiddenSize                        = PadHiddenSize_;
    static constexpr bool PadIntermediateSize                  = PadIntermediateSize_;
+    static constexpr bool PipeInterleave                       = PipeInterleave_;
 };

 // Note: this need to be a bit mask

--- a/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp
@@ -52,6 +52,9 @@ using device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_instances =
        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,     Row,     BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,         S<4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,     Row,     BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           2,                   S<1, 16, 1, 16>,         S<4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,     Row,     BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   224,    64,   8,   8,  16,   16,    8,    7,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          2,           1,                   S<1, 32, 1, 8>,          S<4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,     Row,     BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   160,    64,   8,   8,  16,   16,    8,    5,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          2,           1,                   S<1, 32, 1, 8>,          S<4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,     Row,     BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   160,    64,   8,   8,  32,   32,    1,    5,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 64, 1, 4>,          S<8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,     Row,     BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   160,   128,    64,   8,   8,  32,   32,    5,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,         S<4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,     Row,     BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,         S<4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,     Row,     BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,         S<4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,     Row,     BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,         S<4>,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>

--- a/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_f8_f8_bf16/device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp
@@ -42,6 +42,7 @@ using device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_instances = std
        //##################################|        |        |         |        | Type|  Type|         Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
        //##################################|        |        |         |        |     |      |             |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
        //##################################|        |        |         |        |     |      |             |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
+        
 #ifdef __gfx94__
        // Compute friendly
        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType, BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    64,  16,  16,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               S<8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>,
@@ -72,6 +73,7 @@ using device_batched_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_mem_instances = std:
        //##################################|        |        |         |        | Type|  Type|         Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
        //##################################|        |        |         |        |     |      |             |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
        //##################################|        |        |         |        |     |      |             |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
+
 #if defined(__gfx94__) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType,   BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    128, 16,  16,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               S<2>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,
        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,    Row,     F8,     F8, DsDataType,   BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    128, 16,  16,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               S<4>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1, F8>,

--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp
@@ -41,6 +41,8 @@ using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_instances = st
        //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
        
        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,  32,   32,    4,    4,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
@@ -49,7 +51,9 @@ using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_instances = st
        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   256,    32,   8,   4,  32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   128,    32,   8,   4,  32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   4,   4,  32,   32,    2,    2,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   2,   2,  32,   32,    2,    2,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32,  8, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
    // clang-format on
    >;

@@ -61,14 +65,21 @@ using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_instances = std
        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
        //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |

-        // Latency friendly
+       // Latency friendly
        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   8,   4,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,   8,   4,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   4,   4,  16,   16,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   2,   2,  16,   16,    1,    1,     S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,   8,   4,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 2, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,   4,   4,  16,   16,    1,    1,     S<32, 2, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<32, 2, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   8,   4,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   4,   4,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   2,   2,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   8,   4,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
        // Memory friendly
        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   32,    64,   8,   2,  32,   32,    2,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   32,    64,   2,   2,  32,   32,    2,    1,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   16,    64,   8,   2,  16,   16,    4,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   16,    64,   2,   2,  16,   16,    4,    1,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   32,    64,   8,   4,  32,   32,    2,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   16,    64,   8,   4,  16,   16,    4,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   32,    64,   8,   4,  32,   32,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
@@ -82,6 +93,7 @@ using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_instances = std
        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,  128,    64,   8,   4,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,  128,    64,   8,   4,  32,   32,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    64,   8,   4,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    64,   4,   4,  16,   16,    1,    4,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    32,  256,    64,   8,   4,  32,   32,    1,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,              8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
    // clang-format on
    >;

--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp
@@ -42,14 +42,21 @@ using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_instances = st
        
        // Compute friendly
        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    32,   8,   8,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    32,   4,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    32,   2,   2,  32,   32,    2,    2,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,  32,   32,    4,    4,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,  32,   32,    4,    4,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  16,   16,    8,    8,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-        // AGPR Spill
-        // DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  16,   16,    8,    8,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          1,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          1,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
-        // AGPR Spill when use permuted lds layout. so, use padding for these two.
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  16,   16,    8,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,  16,   16,    8,    8,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+     
        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   224,    64,   8,   8,  16,   16,    8,    7,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          2,           1,                   S<1, 64, 1, 4>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
@@ -68,15 +75,23 @@ using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_instances = std
        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
        //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |

-        // Latency friendly 
+       // Latency friendly 
        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   8,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   4,   4,  16,   16,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   2,   2,  16,   16,    1,    1,     S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,   8,   8,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   8,   8,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   4,   4,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   2,   2,  16,   16,    1,    1,     S<32, 2, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 2, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   8,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
        // Memory friendly
        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   32,    64,   8,   8,  32,   32,    2,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   32,    64,   4,   4,  32,   32,    2,    1,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   32,    64,   2,   2,  32,   32,    2,    1,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   16,    64,   8,   8,  16,   16,    4,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   32,    64,   8,   8,  32,   32,    2,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   32,    64,   4,   4,  32,   32,    2,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   32,    64,   2,   2,  32,   32,    2,    1,     S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   16,    64,   8,   8,  16,   16,    4,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   32,    64,   8,   8,  32,   32,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   16,    64,   8,   8,  16,   16,    2,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
@@ -84,12 +99,16 @@ using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_instances = std
        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,   8,   8,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   8,   8,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   8,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   4,   4,  16,   16,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   2,   2,  16,   16,    1,    1,     S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   64,    64,   8,   8,  16,   16,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   64,    64,   8,   8,  32,   32,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,  128,    64,   8,   8,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,  128,    64,   8,   8,  32,   32,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    64,   8,   8,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    32,  256,    64,   8,   8,  32,   32,    1,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,              8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    32,  256,    64,   8,   8,  32,   32,    1,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,              8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    32,  256,    64,   4,   4,  32,   32,    1,    2,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 16>,              8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    32,  256,    64,   2,   2,  32,   32,    1,    2,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 16>,              8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
    // clang-format on
    >;
 } // namespace instance

--- a/profiler/include/profiler/profile_gemm_universal_batched_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_universal_batched_impl.hpp
@@ -48,6 +48,7 @@ bool profile_gemm_universal_batched_impl(int do_verification,
                                         int StrideB,
                                         int StrideC,
                                         int BatchCount,
+                                         int KBatch,
                                         int n_warmup,
                                         int n_iter,
                                         uint64_t rotating = 0)
@@ -147,89 +148,100 @@ bool profile_gemm_universal_batched_impl(int do_verification,
    float best_ave_time   = 0;
    float best_tflops     = 0;
    float best_gb_per_sec = 0;
+    float best_kbatch     = 0;

    // profile device op instances
    for(auto& op_ptr : op_ptrs)
    {
-        std::unique_ptr<tensor_operation::device::BaseArgument> argument_ptr;
-        // false branch for multi d dl kernel
-
-        argument_ptr =
-            op_ptr->MakeArgumentPointer(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
-                                        static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
-                                        {},
-                                        static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
-                                        M,
-                                        N,
-                                        K,
-                                        BatchCount,
-                                        StrideA,
-                                        StrideB,
-                                        {},
-                                        StrideC,
-                                        BatchStrideA,
-                                        BatchStrideB,
-                                        {},
-                                        BatchStrideC,
-                                        ck::tensor_operation::element_wise::PassThrough{},
-                                        ck::tensor_operation::element_wise::PassThrough{},
-                                        ck::tensor_operation::element_wise::PassThrough{});
-
-        auto invoker_ptr = op_ptr->MakeInvokerPointer();
-
-        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
-        {
-            // re-init C to zero before profiling next kernel
-            c_device_buf.SetZero();
-
-            std::string op_name = op_ptr->GetTypeString();
+        std::vector<int> kbatch_list = {1, 2, 4, 8, 16, 19, 32, 38};

-            float ave_time = invoker_ptr->Run(
-                argument_ptr.get(),
-                StreamConfig{nullptr, time_kernel, 0, n_warmup, n_iter, true, rotating_count});
+        if(KBatch > 0)
+        {
+            kbatch_list = {KBatch};
+        }

-            std::size_t flop = std::size_t(2) * BatchCount * M * N * K;
+        for(std::size_t i = 0; i < kbatch_list.size(); i++)
+        {
+            auto kbatch_curr = kbatch_list[i];
+
+            auto argument_ptr =
+                op_ptr->MakeArgumentPointer(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
+                                            static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
+                                            {},
+                                            static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
+                                            M,
+                                            N,
+                                            K,
+                                            BatchCount,
+                                            StrideA,
+                                            StrideB,
+                                            {},
+                                            StrideC,
+                                            BatchStrideA,
+                                            BatchStrideB,
+                                            {},
+                                            BatchStrideC,
+                                            ck::tensor_operation::element_wise::PassThrough{},
+                                            ck::tensor_operation::element_wise::PassThrough{},
+                                            ck::tensor_operation::element_wise::PassThrough{},
+                                            kbatch_curr);
+
+            auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+            if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+            {
+                std::string op_name = op_ptr->GetTypeString();

-            std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
-                                     sizeof(CDataType) * M * N) *
-                                    BatchCount;
+                float ave_time = invoker_ptr->Run(
+                    argument_ptr.get(),
+                    StreamConfig{nullptr, time_kernel, 0, n_warmup, n_iter, true, rotating_count});

-            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+                std::size_t flop = std::size_t(2) * BatchCount * M * N * K;

-            float gb_per_sec = num_btype / 1.E6 / ave_time;
+                std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
+                                         sizeof(CDataType) * M * N) *
+                                        BatchCount;

-            std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
-                      << " GB/s, " << op_name << std::endl;
+                float tflops = static_cast<float>(flop) / 1.E9 / ave_time;

-            if(tflops > best_tflops)
-            {
-                best_op_name    = op_name;
-                best_tflops     = tflops;
-                best_ave_time   = ave_time;
-                best_gb_per_sec = gb_per_sec;
-            }
+                float gb_per_sec = num_btype / 1.E6 / ave_time;

-            if(do_verification)
-            {
-                c_device_buf.FromDevice(c_g_m_n_device_result.mData.data());
+                std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
+                          << " GB/s, " << op_name << ", KBatch " << kbatch_curr << std::endl;

-                pass = pass & ck::utils::check_err(c_g_m_n_device_result, c_g_m_n_host_result);
+                if(tflops > best_tflops)
+                {
+                    best_op_name    = op_name;
+                    best_tflops     = tflops;
+                    best_ave_time   = ave_time;
+                    best_gb_per_sec = gb_per_sec;
+                    best_kbatch     = kbatch_curr;
+                }

-                if(do_log)
+                if(do_verification)
                {
-                    LogRangeAsType<float>(std::cout << "a : ", a_g_m_k.mData, ",") << std::endl;
-                    LogRangeAsType<float>(std::cout << "b: ", b_g_k_n.mData, ",") << std::endl;
-                    LogRangeAsType<float>(std::cout << "c_host: ", c_g_m_n_host_result.mData, ",")
-                        << std::endl;
-                    LogRangeAsType<float>(
-                        std::cout << "c_device: ", c_g_m_n_device_result.mData, ",")
-                        << std::endl;
+                    c_device_buf.FromDevice(c_g_m_n_device_result.mData.data());
+
+                    pass = pass & ck::utils::check_err(c_g_m_n_device_result, c_g_m_n_host_result);
+
+                    if(do_log)
+                    {
+                        LogRangeAsType<float>(std::cout << "a : ", a_g_m_k.mData, ",") << std::endl;
+                        LogRangeAsType<float>(std::cout << "b: ", b_g_k_n.mData, ",") << std::endl;
+                        LogRangeAsType<float>(
+                            std::cout << "c_host: ", c_g_m_n_host_result.mData, ",")
+                            << std::endl;
+                        LogRangeAsType<float>(
+                            std::cout << "c_device: ", c_g_m_n_device_result.mData, ",")
+                            << std::endl;
+                    }
                }
            }
-        }
-        else
-        {
-            std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl;
+            else
+            {
+                std::cout << op_ptr->GetTypeString() << " does not support this problem"
+                          << std::endl;
+            }
        }
    }

@@ -270,8 +282,8 @@ bool profile_gemm_universal_batched_impl(int do_verification,

    std::cout << " B = " << BatchCount << " M = " << M << " N = " << N << " K = " << K
              << " StrideA = " << StrideA << " StrideB = " << StrideB << " StrideC = " << StrideC
-              << ": " << best_ave_time << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec
-              << " GB/s, " << best_op_name << std::endl;
+              << " KBatch = " << best_kbatch << ": " << best_ave_time << " ms, " << best_tflops
+              << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;

    return pass;
 }

--- a/profiler/include/profiler/profile_gemm_universal_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_universal_impl.hpp
@@ -144,6 +144,7 @@ bool profile_gemm_universal_impl(int do_verification,
    }

    std::string best_op_name;
+    std::optional<std::string> best_op_object_name;
    float best_ave_time   = 0;
    float best_tflops     = 0;
    float best_gb_per_sec = 0;
@@ -225,7 +226,8 @@ bool profile_gemm_universal_impl(int do_verification,
                    }
                }

-                std::string op_name = op_ptr->GetTypeString();
+                std::string op_name                    = op_ptr->GetTypeString();
+                std::optional<std::string> op_obj_name = op_ptr->GetObjectName();

                float ave_time = invoker_ptr->Run(argument_ptr.get(),
                                                  StreamConfig{nullptr,
@@ -251,11 +253,12 @@ bool profile_gemm_universal_impl(int do_verification,

                if(tflops > best_tflops && ave_time > 1e-10)
                {
-                    best_op_name    = op_name;
-                    best_tflops     = tflops;
-                    best_ave_time   = ave_time;
-                    best_gb_per_sec = gb_per_sec;
-                    best_kbatch     = kbatch_curr;
+                    best_op_name        = op_name;
+                    best_op_object_name = op_obj_name;
+                    best_tflops         = tflops;
+                    best_ave_time       = ave_time;
+                    best_gb_per_sec     = gb_per_sec;
+                    best_kbatch         = kbatch_curr;
                }
            }
            else
@@ -306,6 +309,9 @@ bool profile_gemm_universal_impl(int do_verification,
              << " : " << best_ave_time << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec
              << " GB/s, " << best_op_name << std::endl;

+    if(best_op_object_name)
+        std::cout << best_op_object_name.value() << std::endl;
+
    return pass;
 }


--- a/profiler/src/profile_gemm_universal_batched.cpp
+++ b/profiler/src/profile_gemm_universal_batched.cpp
@@ -31,7 +31,7 @@ enum struct GemmDataType

 int profile_batched_gemm_universal(int argc, char* argv[])
 {
-    if(argc != 18 && argc != 21)
+    if(argc != 19 && argc != 22)
    {
        // clang-format off
        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
@@ -44,11 +44,11 @@ int profile_batched_gemm_universal(int argc, char* argv[])
        printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
        printf("arg6: print tensor value (0: no; 1: yes)\n");
        printf("arg7: time kernel (0=n0, 1=yes)\n");
-        printf("arg8 to 17: M, N, K, StrideA, StrideB, StrideC, BatchStrideA, BatchStrideB, BatchStrideC, BatchCount\n");
+        printf("arg8 to 18: M, N, K, StrideA, StrideB, StrideC, BatchStrideA, BatchStrideB, BatchStrideC, BatchCount, KBatch\n");
        printf("optional:\n");
-        printf("arg18: number of warm-up cycles (default 1)\n");
-        printf("arg19: number of iterations (default 10)\n");
-        printf("arg20: memory for rotating buffer (default 0, size in MB)\n");
+        printf("arg19: number of warm-up cycles (default 1)\n");
+        printf("arg20: number of iterations (default 10)\n");
+        printf("arg21: memory for rotating buffer (default 0, size in MB)\n");
        // clang-format on
        exit(1);
    }
@@ -56,11 +56,11 @@ int profile_batched_gemm_universal(int argc, char* argv[])
    int n_warmup      = 1;
    int n_iter        = 10;
    uint64_t rotating = 0;
-    if(argc == 21)
+    if(argc == 22)
    {
-        n_warmup = std::stoi(argv[18]);
-        n_iter   = std::stoi(argv[19]);
-        rotating = std::stoull(argv[20]) * 1024 * 1024;
+        n_warmup = std::stoi(argv[19]);
+        n_iter   = std::stoi(argv[20]);
+        rotating = std::stoull(argv[21]) * 1024 * 1024;
    }

    const auto data_type       = static_cast<GemmDataType>(std::stoi(argv[2]));
@@ -83,6 +83,7 @@ int profile_batched_gemm_universal(int argc, char* argv[])
    const int BatchStrideC = std::stoi(argv[16]);

    const int BatchCount = std::stoi(argv[17]);
+    const int KBatch     = std::stoi(argv[18]);

 #if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)
    using F8 = ck::f8_t;
@@ -159,6 +160,7 @@ int profile_batched_gemm_universal(int argc, char* argv[])
                                                                                    StrideB_,
                                                                                    StrideC_,
                                                                                    BatchCount,
+                                                                                    KBatch,
                                                                                    n_warmup,
                                                                                    n_iter,
                                                                                    rotating);

--- a/script/process_perf_data.py
+++ b/script/process_perf_data.py
@@ -332,7 +332,7 @@ def main():
            table_name="ck_fmha_bwd_tflops"

        tflops_base = get_baseline(table_name,conn)
-        store_new_test_result(table_name, results, testlist, branch_name, node_id, gpu_arch, compute_units, rocm_vers, hip_vers, environment, conn)
+        store_new_test_result(table_name, results, testlist, branch_name, node_id, gpu_arch, compute_units, rocm_vers, hip_vers, environment, sqlEngine)
        conn.close()

    #compare the results to the baseline if baseline exists

--- a/test/ck_tile/gemm/CMakeLists.txt
+++ b/test/ck_tile/gemm/CMakeLists.txt
 # Currently ck_tile is only built on gfx9
 if(GPU_TARGETS MATCHES "gfx9")
-    add_gtest_executable(test_ck_tile_gemm_mem_pipeline test_gemm_mem_pipeline.cpp)
+    add_gtest_executable(test_ck_tile_gemm_pipeline test_gemm_pipeline.cpp)
 endif()
--- a/test/ck_tile/gemm/test_gemm_mem_pipeline.cpp
+++ b/test/ck_tile/gemm/test_gemm_mem_pipeline.cpp
@@ -6,7 +6,7 @@
 #include "gtest/gtest.h"

 #include "ck_tile/host.hpp"
-#include "test_gemm_mem_pipeline_util.hpp"
+#include "test_gemm_pipeline_util.hpp"

 using F16       = ck_tile::half_t;
 using F32       = float;
@@ -16,21 +16,27 @@ using Intrawave = ck_tile::integral_constant<ck_tile::GemmPipelineScheduler,
                                             ck_tile::GemmPipelineScheduler::Intrawave>;
 using Interwave = ck_tile::integral_constant<ck_tile::GemmPipelineScheduler,
                                             ck_tile::GemmPipelineScheduler::Interwave>;
+using Mem       = ck_tile::integral_constant<GemmPipelineType, GemmPipelineType::Mem>;
+using Comp      = ck_tile::integral_constant<GemmPipelineType, GemmPipelineType::Comp>;

 // clang-format off
 using KernelTypes = ::testing::Types<
-    //         ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType, CDataType, GemmPipelineScheduler
-    std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,      F16,             Intrawave>,
-    std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,      F16,             Interwave>,
-    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,      F16,             Intrawave>,
-    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,      F16,             Interwave>,
-    std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,      F16,             Intrawave>,
-    std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,      F16,             Interwave>,
-    std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,      F16,             Intrawave>,
-    std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,      F16,             Interwave>
+    //         ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType, CDataType, GemmPipelineScheduler, PipelineType
+    std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,       F16,             Intrawave,         Mem>,
+    std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,       F16,             Intrawave,        Comp>,
+    std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,       F16,             Interwave,         Mem>,
+    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,             Intrawave,         Mem>,
+    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,             Intrawave,        Comp>,
+    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,             Interwave,         Mem>,
+    std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,       F16,             Intrawave,         Mem>,
+    std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,       F16,             Intrawave,        Comp>,
+    std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,       F16,             Interwave,         Mem>,
+    std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,       F16,             Intrawave,         Mem>,
+    std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,       F16,             Intrawave,        Comp>,
+    std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,       F16,             Interwave,         Mem>
    >;
 // clang-format on

-TYPED_TEST_SUITE(TestCkTileGemmMemPipeline, KernelTypes);
+TYPED_TEST_SUITE(TestCkTileGemmPipeline, KernelTypes);

-#include "test_gemm_mem_pipeline_ut_cases.inc"
+#include "test_gemm_pipeline_ut_cases.inc"
--- a/test/ck_tile/gemm/test_gemm_mem_pipeline_ut_cases.inc
+++ b/test/ck_tile/gemm/test_gemm_mem_pipeline_ut_cases.inc
@@ -3,7 +3,7 @@

 #pragma once

-TYPED_TEST(TestCkTileGemmMemPipeline, SmallM)
+TYPED_TEST(TestCkTileGemmPipeline, SmallM)
 {
    std::vector<int> Ms{1, 2, 3, 4, 5, 6};
    constexpr int N = 1024;
@@ -13,7 +13,7 @@ TYPED_TEST(TestCkTileGemmMemPipeline, SmallM)
        this->Run(M, N, K);
 }

-TYPED_TEST(TestCkTileGemmMemPipeline, MidLargeM)
+TYPED_TEST(TestCkTileGemmPipeline, MidLargeM)
 {
    std::vector<int> Ms{127, 255, 312, 799, 1573};
    constexpr int N = 1024;
@@ -23,7 +23,7 @@ TYPED_TEST(TestCkTileGemmMemPipeline, MidLargeM)
        this->Run(M, N, K);
 }

-TYPED_TEST(TestCkTileGemmMemPipeline, PaddK)
+TYPED_TEST(TestCkTileGemmPipeline, PaddK)
 {
    std::vector<int> Ms{127};
    constexpr int N = 1024;
@@ -33,7 +33,7 @@ TYPED_TEST(TestCkTileGemmMemPipeline, PaddK)
        this->Run(M, N, K);
 }

-TYPED_TEST(TestCkTileGemmMemPipeline, Regular)
+TYPED_TEST(TestCkTileGemmPipeline, Regular)
 {
    std::vector<int> Ms{512};
    constexpr int N = 1024;
@@ -43,7 +43,7 @@ TYPED_TEST(TestCkTileGemmMemPipeline, Regular)
        this->Run(M, N, K);
 }

-TYPED_TEST(TestCkTileGemmMemPipeline, NotSupportedArgument)
+TYPED_TEST(TestCkTileGemmPipeline, NotSupportedArgument)
 {
    constexpr int M = 512;
    constexpr int N = 1025;

--- a/test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp
+++ b/test/ck_tile/gemm/test_gemm_mem_pipeline_util.hpp
@@ -11,18 +11,24 @@
 #include "ck_tile/ops/epilogue.hpp"
 #include "ck_tile/ops/gemm.hpp"

+enum struct GemmPipelineType
+{
+    Mem,
+    Comp
+};
 template <typename Tuple>
-class TestCkTileGemmMemPipeline : public ::testing::Test
+class TestCkTileGemmPipeline : public ::testing::Test
 {
    protected:
-    using ALayout                   = std::tuple_element_t<0, Tuple>;
-    using BLayout                   = std::tuple_element_t<1, Tuple>;
-    using CLayout                   = std::tuple_element_t<2, Tuple>;
-    using ADataType                 = std::tuple_element_t<3, Tuple>;
-    using BDataType                 = std::tuple_element_t<4, Tuple>;
-    using AccDataType               = std::tuple_element_t<5, Tuple>;
-    using CDataType                 = std::tuple_element_t<6, Tuple>;
-    static constexpr auto Scheduler = std::tuple_element_t<7, Tuple>::value;
+    using ALayout                      = std::tuple_element_t<0, Tuple>;
+    using BLayout                      = std::tuple_element_t<1, Tuple>;
+    using CLayout                      = std::tuple_element_t<2, Tuple>;
+    using ADataType                    = std::tuple_element_t<3, Tuple>;
+    using BDataType                    = std::tuple_element_t<4, Tuple>;
+    using AccDataType                  = std::tuple_element_t<5, Tuple>;
+    using CDataType                    = std::tuple_element_t<6, Tuple>;
+    static constexpr auto Scheduler    = std::tuple_element_t<7, Tuple>::value;
+    static constexpr auto PipelineType = std::tuple_element_t<8, Tuple>::value;
    // TODO: expose tile size through test t-param ?

    struct gemm_args
@@ -74,8 +80,13 @@ class TestCkTileGemmMemPipeline : public ::testing::Test

        using Traits = ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>;

-        using BaseGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrMem<
-            ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>>;
+        using BaseGemmPipeline = std::conditional_t<
+            PipelineType == GemmPipelineType::Mem,
+            ck_tile::BaseGemmPipelineAgBgCrMem<
+                ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>>,
+            ck_tile::BaseGemmPipelineAgBgCrCompV3<
+                ck_tile::
+                    GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>>>;

        const ck_tile::index_t num_loop    = TilePartitioner::GetLoopNum(args.K);
        const bool has_hot_loop            = BaseGemmPipeline::BlockHasHotloop(num_loop);
@@ -85,15 +96,26 @@ class TestCkTileGemmMemPipeline : public ::testing::Test
            constexpr bool has_hot_loop_v = has_hot_loop_.value;
            constexpr auto tail_number_v  = tail_number_.value;

-            using GemmPipeline = ck_tile::GemmPipelineAgBgCrMem<
-                ck_tile::UniversalGemmPipelineProblem<ADataType,
-                                                      BDataType,
-                                                      AccDataType,
-                                                      GemmShape,
-                                                      Traits,
-                                                      Scheduler,
-                                                      has_hot_loop_v,
-                                                      tail_number_v>>;
+            using GemmPipeline =
+                std::conditional_t<PipelineType == GemmPipelineType::Mem,
+                                   ck_tile::GemmPipelineAgBgCrMem<
+                                       ck_tile::UniversalGemmPipelineProblem<ADataType,
+                                                                             BDataType,
+                                                                             AccDataType,
+                                                                             GemmShape,
+                                                                             Traits,
+                                                                             Scheduler,
+                                                                             has_hot_loop_v,
+                                                                             tail_number_v>>,
+                                   ck_tile::GemmPipelineAgBgCrCompV3<
+                                       ck_tile::UniversalGemmPipelineProblem<ADataType,
+                                                                             BDataType,
+                                                                             AccDataType,
+                                                                             GemmShape,
+                                                                             Traits,
+                                                                             Scheduler,
+                                                                             has_hot_loop_v,
+                                                                             tail_number_v>>>;
            using Kernel = ck_tile::GemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
            auto kargs   = Kernel::MakeKernelArgs(args.p_a,
                                                args.p_b,