// Instruction-flavour selection for this micro-kernel.
//
// CK_TILE_FLATMM_UK_MFMA picks the variant; when the includer has not set it, the INT8
// variant is used. Each variant provides three macros that are pasted into the asm text:
//   _UK_MFMA_                  matrix instruction mnemonic
//   _UK_ATOMIC_ADD_            packed atomic-add mnemonic
//   _UK_PK_CVT_(x0_, x1_, y_)  instruction sequence that converts the registers named by
//                              x0_ and x1_ and combines the two results into y_
// The _UK_PK_CVT_ arguments are string literals (register names) that get concatenated
// into the instruction text.
#ifndef CK_TILE_FLATMM_UK_MFMA
#    define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_INT8
#endif

#if CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_INT8

#    define _UK_MFMA_ "v_mfma_i32_16x16x32_i8"
#    define _UK_ATOMIC_ADD_ "global_atomic_pk_add_bf16"

// For each input: v_cmp_u_f32 flags in s[36:37] the lanes where the value is unordered with
// itself (NaN); v50 = input + %[v_nan_lo] + 1; flagged lanes take %[v_nan_hi] instead of v50.
// v_perm_b32 then assembles y_ from v55/v54 using the selector held in s52.
// Scratch written: s[36:37], v50, v54, v55.
// NOTE(review): presumably an f32 -> packed bf16 conversion; confirm the values bound to
// v_nan_lo / v_nan_hi / s52 at the include site.
#    define _UK_PK_CVT_(x0_, x1_, y_) \
        "  v_cmp_u_f32   s[36:37], " x0_ ", " x0_ "          \n" \
        "  v_add3_u32    v50, " x0_ ", %[v_nan_lo], 1        \n" \
        "  v_cndmask_b32  v54, v50, %[v_nan_hi], s[36:37]    \n" \
        "  v_cmp_u_f32   s[36:37], " x1_ ", " x1_ "          \n" \
        "  v_add3_u32    v50, " x1_ ", %[v_nan_lo], 1        \n" \
        "  v_cndmask_b32  v55, v50, %[v_nan_hi], s[36:37]    \n" \
        "  v_perm_b32    " y_ ", v55, v54, s52               \n"

#elif CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_FP16

#    define _UK_MFMA_ "v_mfma_f32_16x16x16_f16"
#    define _UK_ATOMIC_ADD_ "global_atomic_pk_add_f16"

// Converts both inputs to f16 (into v54 and v55) and packs the pair into y_.
// Scratch written: v54, v55.
#    define _UK_PK_CVT_(x0_, x1_, y_) \
        "  v_cvt_f16_f32  v54, " x0_ "  \n" \
        "  v_cvt_f16_f32  v55, " x1_ "  \n" \
        "  v_pack_b32_f16 " y_ ", v54, v55  \n"

#endif
" s_mov_b32 s8,    %[s_res_o0] \n"
" s_mov_b32 s9,    %[s_res_o1] \n"
" s_mov_b32 s12,    %[s_res_b0] \n"
" s_mov_b32 s13,    %[s_res_b1] \n"
" s_mov_b32 s14,    %[s_res_b2] \n"
" s_mov_b32 s15,    %[s_res_b3] \n"
"  s_waitcnt     vmcnt(24)                              \n"
"  buffer_load_dwordx4  acc[0:3], %[v_os_b0], s[12:15], 0 offen\n"
"  v_mul_f32     v54, v128, v128                        \n"
"  v_mul_f32     v55, v129, v129                        \n"
"  v_mul_f32     v56, v130, v130                        \n"
"  v_mul_f32     v57, v131, v131                        \n"
"  v_fma_f32     v54, v54, s77, v1                      \n"
"  v_fma_f32     v55, v55, s77, v1                      \n"
"  v_fma_f32     v56, v56, s77, v1                      \n"
"  v_fma_f32     v57, v57, s77, v1                      \n"
"  v_mul_f32     v54, v54, v128                         \n"
"  v_mul_f32     v55, v55, v129                         \n"
"  v_mul_f32     v56, v56, v130                         \n"
"  v_mul_f32     v57, v57, v131                         \n"
"  v_mul_f32     v54, v54, s6                           \n"
"  v_mul_f32     v55, v55, s6                           \n"
"  v_mul_f32     v56, v56, s6                           \n"
"  v_mul_f32     v57, v57, s6                           \n"
"  v_exp_f32     v54, v54                               \n"
"  v_exp_f32     v55, v55                               \n"
"  v_exp_f32     v56, v56                               \n"
"  v_exp_f32     v57, v57                               \n"
"  buffer_load_dwordx4  acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024\n"
"  v_add_f32     v54, v54, 1.0                          \n"
"  v_add_f32     v55, v55, 1.0                          \n"
"  v_add_f32     v56, v56, 1.0                          \n"
"  v_add_f32     v57, v57, 1.0                          \n"
"  v_rcp_f32     v54, v54                               \n"
"  v_rcp_f32     v55, v55                               \n"
"  v_rcp_f32     v56, v56                               \n"
"  v_rcp_f32     v57, v57                               \n"
"  v_mul_f32     v128, v128, v54                        \n"
"  v_mul_f32     v129, v129, v55                        \n"
"  v_mul_f32     v130, v130, v56                        \n"
"  v_mul_f32     v131, v131, v57                        \n"
"  buffer_load_dwordx4  acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048\n"
"  v_mul_f32     v54, v132, v132                        \n"
"  v_mul_f32     v55, v133, v133                        \n"
"  v_mul_f32     v56, v134, v134                        \n"
"  v_mul_f32     v57, v135, v135                        \n"
"  v_fma_f32     v54, v54, s77, v1                      \n"
"  v_fma_f32     v55, v55, s77, v1                      \n"
"  v_fma_f32     v56, v56, s77, v1                      \n"
"  v_fma_f32     v57, v57, s77, v1                      \n"
"  v_mul_f32     v54, v54, v132                         \n"
"  v_mul_f32     v55, v55, v133                         \n"
"  v_mul_f32     v56, v56, v134                         \n"
"  v_mul_f32     v57, v57, v135                         \n"
"  v_mul_f32     v54, v54, s6                           \n"
"  v_mul_f32     v55, v55, s6                           \n"
"  v_mul_f32     v56, v56, s6                           \n"
"  v_mul_f32     v57, v57, s6                           \n"
"  v_exp_f32     v54, v54                               \n"
"  v_exp_f32     v55, v55                               \n"
"  v_exp_f32     v56, v56                               \n"
"  v_exp_f32     v57, v57                               \n"
"  buffer_load_dwordx4  acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072\n"
"  v_add_f32     v54, v54, 1.0                          \n"
"  v_add_f32     v55, v55, 1.0                          \n"
"  v_add_f32     v56, v56, 1.0                          \n"
"  v_add_f32     v57, v57, 1.0                          \n"
"  v_rcp_f32     v54, v54                               \n"
"  v_rcp_f32     v55, v55                               \n"
"  v_rcp_f32     v56, v56                               \n"
"  v_rcp_f32     v57, v57                               \n"
"  v_mul_f32     v132, v132, v54                        \n"
"  v_mul_f32     v133, v133, v55                        \n"
"  v_mul_f32     v134, v134, v56                        \n"
"  v_mul_f32     v135, v135, v57                        \n"
"  buffer_load_dwordx4  acc[16:19], %[v_os_b1], s[12:15], 0 offen\n"
"  v_mul_f32     v54, v136, v136                        \n"
"  v_mul_f32     v55, v137, v137                        \n"
"  v_mul_f32     v56, v138, v138                        \n"
"  v_mul_f32     v57, v139, v139                        \n"
"  v_fma_f32     v54, v54, s77, v1                      \n"
"  v_fma_f32     v55, v55, s77, v1                      \n"
"  v_fma_f32     v56, v56, s77, v1                      \n"
"  v_fma_f32     v57, v57, s77, v1                      \n"
"  v_mul_f32     v54, v54, v136                         \n"
"  v_mul_f32     v55, v55, v137                         \n"
"  v_mul_f32     v56, v56, v138                         \n"
"  v_mul_f32     v57, v57, v139                         \n"
"  v_mul_f32     v54, v54, s6                           \n"
"  v_mul_f32     v55, v55, s6                           \n"
"  v_mul_f32     v56, v56, s6                           \n"
"  v_mul_f32     v57, v57, s6                           \n"
"  v_exp_f32     v54, v54                               \n"
"  v_exp_f32     v55, v55                               \n"
"  v_exp_f32     v56, v56                               \n"
"  v_exp_f32     v57, v57                               \n"
"  buffer_load_dwordx4  acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024\n"
"  v_add_f32     v54, v54, 1.0                          \n"
"  v_add_f32     v55, v55, 1.0                          \n"
"  v_add_f32     v56, v56, 1.0                          \n"
"  v_add_f32     v57, v57, 1.0                          \n"
"  v_rcp_f32     v54, v54                               \n"
"  v_rcp_f32     v55, v55                               \n"
"  v_rcp_f32     v56, v56                               \n"
"  v_rcp_f32     v57, v57                               \n"
"  v_mul_f32     v136, v136, v54                        \n"
"  v_mul_f32     v137, v137, v55                        \n"
"  v_mul_f32     v138, v138, v56                        \n"
"  v_mul_f32     v139, v139, v57                        \n"
"  buffer_load_dwordx4  acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048\n"
"  v_mul_f32     v54, v140, v140                        \n"
"  v_mul_f32     v55, v141, v141                        \n"
"  v_mul_f32     v56, v142, v142                        \n"
"  v_mul_f32     v57, v143, v143                        \n"
"  v_fma_f32     v54, v54, s77, v1                      \n"
"  v_fma_f32     v55, v55, s77, v1                      \n"
"  v_fma_f32     v56, v56, s77, v1                      \n"
"  v_fma_f32     v57, v57, s77, v1                      \n"
"  v_mul_f32     v54, v54, v140                         \n"
"  v_mul_f32     v55, v55, v141                         \n"
"  v_mul_f32     v56, v56, v142                         \n"
"  v_mul_f32     v57, v57, v143                         \n"
"  v_mul_f32     v54, v54, s6                           \n"
"  v_mul_f32     v55, v55, s6                           \n"
"  v_mul_f32     v56, v56, s6                           \n"
"  v_mul_f32     v57, v57, s6                           \n"
"  v_exp_f32     v54, v54                               \n"
"  v_exp_f32     v55, v55                               \n"
"  v_exp_f32     v56, v56                               \n"
"  v_exp_f32     v57, v57                               \n"
"  buffer_load_dwordx4  acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072\n"
"  v_add_f32     v54, v54, 1.0                          \n"
"  v_add_f32     v55, v55, 1.0                          \n"
"  v_add_f32     v56, v56, 1.0                          \n"
"  v_add_f32     v57, v57, 1.0                          \n"
"  v_rcp_f32     v54, v54                               \n"
"  v_rcp_f32     v55, v55                               \n"
"  v_rcp_f32     v56, v56                               \n"
"  v_rcp_f32     v57, v57                               \n"
"  v_mul_f32     v140, v140, v54                        \n"
"  v_mul_f32     v141, v141, v55                        \n"
"  v_mul_f32     v142, v142, v56                        \n"
"  v_mul_f32     v143, v143, v57                        \n"
"  s_waitcnt     vmcnt(24)                              \n"
"  buffer_load_dwordx4  acc[32:35], %[v_os_b2], s[12:15], 0 offen\n"
"  v_mul_f32     v54, v144, v144                        \n"
"  v_mul_f32     v55, v145, v145                        \n"
"  v_mul_f32     v56, v146, v146                        \n"
"  v_mul_f32     v57, v147, v147                        \n"
"  v_fma_f32     v54, v54, s77, v1                      \n"
"  v_fma_f32     v55, v55, s77, v1                      \n"
"  v_fma_f32     v56, v56, s77, v1                      \n"
"  v_fma_f32     v57, v57, s77, v1                      \n"
"  v_mul_f32     v54, v54, v144                         \n"
"  v_mul_f32     v55, v55, v145                         \n"
"  v_mul_f32     v56, v56, v146                         \n"
"  v_mul_f32     v57, v57, v147                         \n"
"  v_mul_f32     v54, v54, s6                           \n"
"  v_mul_f32     v55, v55, s6                           \n"
"  v_mul_f32     v56, v56, s6                           \n"
"  v_mul_f32     v57, v57, s6                           \n"
"  v_exp_f32     v54, v54                               \n"
"  v_exp_f32     v55, v55                               \n"
"  v_exp_f32     v56, v56                               \n"
"  v_exp_f32     v57, v57                               \n"
"  buffer_load_dwordx4  acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024\n"
"  v_add_f32     v54, v54, 1.0                          \n"
"  v_add_f32     v55, v55, 1.0                          \n"
"  v_add_f32     v56, v56, 1.0                          \n"
"  v_add_f32     v57, v57, 1.0                          \n"
"  v_rcp_f32     v54, v54                               \n"
"  v_rcp_f32     v55, v55                               \n"
"  v_rcp_f32     v56, v56                               \n"
"  v_rcp_f32     v57, v57                               \n"
"  v_mul_f32     v144, v144, v54                        \n"
"  v_mul_f32     v145, v145, v55                        \n"
"  v_mul_f32     v146, v146, v56                        \n"
"  v_mul_f32     v147, v147, v57                        \n"
"  buffer_load_dwordx4  acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048\n"
"  v_mul_f32     v54, v148, v148                        \n"
"  v_mul_f32     v55, v149, v149                        \n"
"  v_mul_f32     v56, v150, v150                        \n"
"  v_mul_f32     v57, v151, v151                        \n"
"  v_fma_f32     v54, v54, s77, v1                      \n"
"  v_fma_f32     v55, v55, s77, v1                      \n"
"  v_fma_f32     v56, v56, s77, v1                      \n"
"  v_fma_f32     v57, v57, s77, v1                      \n"
"  v_mul_f32     v54, v54, v148                         \n"
"  v_mul_f32     v55, v55, v149                         \n"
"  v_mul_f32     v56, v56, v150                         \n"
"  v_mul_f32     v57, v57, v151                         \n"
"  v_mul_f32     v54, v54, s6                           \n"
"  v_mul_f32     v55, v55, s6                           \n"
"  v_mul_f32     v56, v56, s6                           \n"
"  v_mul_f32     v57, v57, s6                           \n"
"  v_exp_f32     v54, v54                               \n"
"  v_exp_f32     v55, v55                               \n"
"  v_exp_f32     v56, v56                               \n"
"  v_exp_f32     v57, v57                               \n"
"  buffer_load_dwordx4  acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072\n"
"  v_add_f32     v54, v54, 1.0                          \n"
"  v_add_f32     v55, v55, 1.0                          \n"
"  v_add_f32     v56, v56, 1.0                          \n"
"  v_add_f32     v57, v57, 1.0                          \n"
"  v_rcp_f32     v54, v54                               \n"
"  v_rcp_f32     v55, v55                               \n"
"  v_rcp_f32     v56, v56                               \n"
"  v_rcp_f32     v57, v57                               \n"
"  v_mul_f32     v148, v148, v54                        \n"
"  v_mul_f32     v149, v149, v55                        \n"
"  v_mul_f32     v150, v150, v56                        \n"
"  v_mul_f32     v151, v151, v57                        \n"
"  buffer_load_dwordx4  acc[48:51], %[v_os_b3], s[12:15], 0 offen\n"
"  v_mul_f32     v54, v152, v152                        \n"
"  v_mul_f32     v55, v153, v153                        \n"
"  v_mul_f32     v56, v154, v154                        \n"
"  v_mul_f32     v57, v155, v155                        \n"
"  v_fma_f32     v54, v54, s77, v1                      \n"
"  v_fma_f32     v55, v55, s77, v1                      \n"
"  v_fma_f32     v56, v56, s77, v1                      \n"
"  v_fma_f32     v57, v57, s77, v1                      \n"
"  v_mul_f32     v54, v54, v152                         \n"
"  v_mul_f32     v55, v55, v153                         \n"
"  v_mul_f32     v56, v56, v154                         \n"
"  v_mul_f32     v57, v57, v155                         \n"
"  v_mul_f32     v54, v54, s6                           \n"
"  v_mul_f32     v55, v55, s6                           \n"
"  v_mul_f32     v56, v56, s6                           \n"
"  v_mul_f32     v57, v57, s6                           \n"
"  v_exp_f32     v54, v54                               \n"
"  v_exp_f32     v55, v55                               \n"
"  v_exp_f32     v56, v56                               \n"
"  v_exp_f32     v57, v57                               \n"
"  buffer_load_dwordx4  acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024\n"
"  v_add_f32     v54, v54, 1.0                          \n"
"  v_add_f32     v55, v55, 1.0                          \n"
"  v_add_f32     v56, v56, 1.0                          \n"
"  v_add_f32     v57, v57, 1.0                          \n"
"  v_rcp_f32     v54, v54                               \n"
"  v_rcp_f32     v55, v55                               \n"
"  v_rcp_f32     v56, v56                               \n"
"  v_rcp_f32     v57, v57                               \n"
"  v_mul_f32     v152, v152, v54                        \n"
"  v_mul_f32     v153, v153, v55                        \n"
"  v_mul_f32     v154, v154, v56                        \n"
"  v_mul_f32     v155, v155, v57                        \n"
"  buffer_load_dwordx4  acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048\n"
"  v_mul_f32     v54, v156, v156                        \n"
"  v_mul_f32     v55, v157, v157                        \n"
"  v_mul_f32     v56, v158, v158                        \n"
"  v_mul_f32     v57, v159, v159                        \n"
"  v_fma_f32     v54, v54, s77, v1                      \n"
"  v_fma_f32     v55, v55, s77, v1                      \n"
"  v_fma_f32     v56, v56, s77, v1                      \n"
"  v_fma_f32     v57, v57, s77, v1                      \n"
"  v_mul_f32     v54, v54, v156                         \n"
"  v_mul_f32     v55, v55, v157                         \n"
"  v_mul_f32     v56, v56, v158                         \n"
"  v_mul_f32     v57, v57, v159                         \n"
"  v_mul_f32     v54, v54, s6                           \n"
"  v_mul_f32     v55, v55, s6                           \n"
"  v_mul_f32     v56, v56, s6                           \n"
"  v_mul_f32     v57, v57, s6                           \n"
"  v_exp_f32     v54, v54                               \n"
"  v_exp_f32     v55, v55                               \n"
"  v_exp_f32     v56, v56                               \n"
"  v_exp_f32     v57, v57                               \n"
"  buffer_load_dwordx4  acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072\n"
"  s_add_u32     s12, %[s_tile_os_b_half], s12                          \n"
"  s_addc_u32    s13, 0, s13                            \n"
"  v_add_f32     v54, v54, 1.0                          \n"
"  v_add_f32     v55, v55, 1.0                          \n"
"  v_add_f32     v56, v56, 1.0                          \n"
"  v_add_f32     v57, v57, 1.0                          \n"
"  v_rcp_f32     v54, v54                               \n"
"  v_rcp_f32     v55, v55                               \n"
"  v_rcp_f32     v56, v56                               \n"
"  v_rcp_f32     v57, v57                               \n"
"  v_mul_f32     v156, v156, v54                        \n"
"  v_mul_f32     v157, v157, v55                        \n"
"  v_mul_f32     v158, v158, v56                        \n"
"  v_mul_f32     v159, v159, v57                        \n"
"  s_waitcnt     vmcnt(24)                              \n"
"  buffer_load_dwordx4  acc[64:67], %[v_os_b0], s[12:15], 0 offen\n"
"  v_mul_f32     v54, v160, v160                        \n"
"  v_mul_f32     v55, v161, v161                        \n"
"  v_mul_f32     v56, v162, v162                        \n"
"  v_mul_f32     v57, v163, v163                        \n"
"  v_fma_f32     v54, v54, s77, v1                      \n"
"  v_fma_f32     v55, v55, s77, v1                      \n"
"  v_fma_f32     v56, v56, s77, v1                      \n"
"  v_fma_f32     v57, v57, s77, v1                      \n"
"  v_mul_f32     v54, v54, v160                         \n"
"  v_mul_f32     v55, v55, v161                         \n"
"  v_mul_f32     v56, v56, v162                         \n"
"  v_mul_f32     v57, v57, v163                         \n"
"  v_mul_f32     v54, v54, s6                           \n"
"  v_mul_f32     v55, v55, s6                           \n"
"  v_mul_f32     v56, v56, s6                           \n"
"  v_mul_f32     v57, v57, s6                           \n"
"  v_exp_f32     v54, v54                               \n"
"  v_exp_f32     v55, v55                               \n"
"  v_exp_f32     v56, v56                               \n"
"  v_exp_f32     v57, v57                               \n"
"  buffer_load_dwordx4  acc[68:71], %[v_os_b0], s[12:15], 0 offen offset:1024\n"
"  v_add_f32     v54, v54, 1.0                          \n"
"  v_add_f32     v55, v55, 1.0                          \n"
"  v_add_f32     v56, v56, 1.0                          \n"
"  v_add_f32     v57, v57, 1.0                          \n"
"  v_rcp_f32     v54, v54                               \n"
"  v_rcp_f32     v55, v55                               \n"
"  v_rcp_f32     v56, v56                               \n"
"  v_rcp_f32     v57, v57                               \n"
"  v_mul_f32     v160, v160, v54                        \n"
"  v_mul_f32     v161, v161, v55                        \n"
"  v_mul_f32     v162, v162, v56                        \n"
"  v_mul_f32     v163, v163, v57                        \n"
"  buffer_load_dwordx4  acc[72:75], %[v_os_b0], s[12:15], 0 offen offset:2048\n"
"  v_mul_f32     v54, v164, v164                        \n"
"  v_mul_f32     v55, v165, v165                        \n"
"  v_mul_f32     v56, v166, v166                        \n"
"  v_mul_f32     v57, v167, v167                        \n"
"  v_fma_f32     v54, v54, s77, v1                      \n"
"  v_fma_f32     v55, v55, s77, v1                      \n"
"  v_fma_f32     v56, v56, s77, v1                      \n"
"  v_fma_f32     v57, v57, s77, v1                      \n"
"  v_mul_f32     v54, v54, v164                         \n"
"  v_mul_f32     v55, v55, v165                         \n"
"  v_mul_f32     v56, v56, v166                         \n"
"  v_mul_f32     v57, v57, v167                         \n"
"  v_mul_f32     v54, v54, s6                           \n"
"  v_mul_f32     v55, v55, s6                           \n"
"  v_mul_f32     v56, v56, s6                           \n"
"  v_mul_f32     v57, v57, s6                           \n"
"  v_exp_f32     v54, v54                               \n"
"  v_exp_f32     v55, v55                               \n"
"  v_exp_f32     v56, v56                               \n"
"  v_exp_f32     v57, v57                               \n"
"  buffer_load_dwordx4  acc[76:79], %[v_os_b0], s[12:15], 0 offen offset:3072\n"
"  v_add_f32     v54, v54, 1.0                          \n"
"  v_add_f32     v55, v55, 1.0                          \n"
"  v_add_f32     v56, v56, 1.0                          \n"
"  v_add_f32     v57, v57, 1.0                          \n"
"  v_rcp_f32     v54, v54                               \n"
"  v_rcp_f32     v55, v55                               \n"
"  v_rcp_f32     v56, v56                               \n"
"  v_rcp_f32     v57, v57                               \n"
"  v_mul_f32     v164, v164, v54                        \n"
"  v_mul_f32     v165, v165, v55                        \n"
"  v_mul_f32     v166, v166, v56                        \n"
"  v_mul_f32     v167, v167, v57                        \n"
"  buffer_load_dwordx4  acc[80:83], %[v_os_b1], s[12:15], 0 offen\n"
"  v_mul_f32     v54, v168, v168                        \n"
"  v_mul_f32     v55, v169, v169                        \n"
"  v_mul_f32     v56, v170, v170                        \n"
"  v_mul_f32     v57, v171, v171                        \n"
"  v_fma_f32     v54, v54, s77, v1                      \n"
"  v_fma_f32     v55, v55, s77, v1                      \n"
"  v_fma_f32     v56, v56, s77, v1                      \n"
"  v_fma_f32     v57, v57, s77, v1                      \n"
"  v_mul_f32     v54, v54, v168                         \n"
"  v_mul_f32     v55, v55, v169                         \n"
"  v_mul_f32     v56, v56, v170                         \n"
"  v_mul_f32     v57, v57, v171                         \n"
"  v_mul_f32     v54, v54, s6                           \n"
"  v_mul_f32     v55, v55, s6                           \n"
"  v_mul_f32     v56, v56, s6                           \n"
"  v_mul_f32     v57, v57, s6                           \n"
"  v_exp_f32     v54, v54                               \n"
"  v_exp_f32     v55, v55                               \n"
"  v_exp_f32     v56, v56                               \n"
"  v_exp_f32     v57, v57                               \n"
"  buffer_load_dwordx4  acc[84:87], %[v_os_b1], s[12:15], 0 offen offset:1024\n"
"  v_add_f32     v54, v54, 1.0                          \n"
"  v_add_f32     v55, v55, 1.0                          \n"
"  v_add_f32     v56, v56, 1.0                          \n"
"  v_add_f32     v57, v57, 1.0                          \n"
"  v_rcp_f32     v54, v54                               \n"
"  v_rcp_f32     v55, v55                               \n"
"  v_rcp_f32     v56, v56                               \n"
"  v_rcp_f32     v57, v57                               \n"
"  v_mul_f32     v168, v168, v54                        \n"
"  v_mul_f32     v169, v169, v55                        \n"
"  v_mul_f32     v170, v170, v56                        \n"
"  v_mul_f32     v171, v171, v57                        \n"
"  buffer_load_dwordx4  acc[88:91], %[v_os_b1], s[12:15], 0 offen offset:2048\n"
"  v_mul_f32     v54, v172, v172                        \n"
"  v_mul_f32     v55, v173, v173                        \n"
"  v_mul_f32     v56, v174, v174                        \n"
"  v_mul_f32     v57, v175, v175                        \n"
"  v_fma_f32     v54, v54, s77, v1                      \n"
"  v_fma_f32     v55, v55, s77, v1                      \n"
"  v_fma_f32     v56, v56, s77, v1                      \n"
"  v_fma_f32     v57, v57, s77, v1                      \n"
"  v_mul_f32     v54, v54, v172                         \n"
"  v_mul_f32     v55, v55, v173                         \n"
"  v_mul_f32     v56, v56, v174                         \n"
"  v_mul_f32     v57, v57, v175                         \n"
"  v_mul_f32     v54, v54, s6                           \n"
"  v_mul_f32     v55, v55, s6                           \n"
"  v_mul_f32     v56, v56, s6                           \n"
"  v_mul_f32     v57, v57, s6                           \n"
"  v_exp_f32     v54, v54                               \n"
"  v_exp_f32     v55, v55                               \n"
"  v_exp_f32     v56, v56                               \n"
"  v_exp_f32     v57, v57                               \n"
"  buffer_load_dwordx4  acc[92:95], %[v_os_b1], s[12:15], 0 offen offset:3072\n"
"  v_add_f32     v54, v54, 1.0                          \n"
"  v_add_f32     v55, v55, 1.0                          \n"
"  v_add_f32     v56, v56, 1.0                          \n"
"  v_add_f32     v57, v57, 1.0                          \n"
"  v_rcp_f32     v54, v54                               \n"
"  v_rcp_f32     v55, v55                               \n"
"  v_rcp_f32     v56, v56                               \n"
"  v_rcp_f32     v57, v57                               \n"
"  v_mul_f32     v172, v172, v54                        \n"
"  v_mul_f32     v173, v173, v55                        \n"
"  v_mul_f32     v174, v174, v56                        \n"
"  v_mul_f32     v175, v175, v57                        \n"
"  s_waitcnt     vmcnt(24)                              \n"
"  buffer_load_dwordx4  acc[96:99], %[v_os_b2], s[12:15], 0 offen\n"
"  v_mul_f32     v54, v176, v176                        \n"
"  v_mul_f32     v55, v177, v177                        \n"
"  v_mul_f32     v56, v178, v178                        \n"
"  v_mul_f32     v57, v179, v179                        \n"
"  v_fma_f32     v54, v54, s77, v1                      \n"
"  v_fma_f32     v55, v55, s77, v1                      \n"
"  v_fma_f32     v56, v56, s77, v1                      \n"
"  v_fma_f32     v57, v57, s77, v1                      \n"
"  v_mul_f32     v54, v54, v176                         \n"
"  v_mul_f32     v55, v55, v177                         \n"
"  v_mul_f32     v56, v56, v178                         \n"
"  v_mul_f32     v57, v57, v179                         \n"
"  v_mul_f32     v54, v54, s6                           \n"
"  v_mul_f32     v55, v55, s6                           \n"
"  v_mul_f32     v56, v56, s6                           \n"
"  v_mul_f32     v57, v57, s6                           \n"
"  v_exp_f32     v54, v54                               \n"
"  v_exp_f32     v55, v55                               \n"
"  v_exp_f32     v56, v56                               \n"
"  v_exp_f32     v57, v57                               \n"
"  buffer_load_dwordx4  acc[100:103], %[v_os_b2], s[12:15], 0 offen offset:1024\n"
"  v_add_f32     v54, v54, 1.0                          \n"
"  v_add_f32     v55, v55, 1.0                          \n"
"  v_add_f32     v56, v56, 1.0                          \n"
"  v_add_f32     v57, v57, 1.0                          \n"
"  v_rcp_f32     v54, v54                               \n"
"  v_rcp_f32     v55, v55                               \n"
"  v_rcp_f32     v56, v56                               \n"
"  v_rcp_f32     v57, v57                               \n"
"  v_mul_f32     v176, v176, v54                        \n"
"  v_mul_f32     v177, v177, v55                        \n"
"  v_mul_f32     v178, v178, v56                        \n"
"  v_mul_f32     v179, v179, v57                        \n"
"  buffer_load_dwordx4  acc[104:107], %[v_os_b2], s[12:15], 0 offen offset:2048\n"
"  v_mul_f32     v54, v180, v180                        \n"
"  v_mul_f32     v55, v181, v181                        \n"
"  v_mul_f32     v56, v182, v182                        \n"
"  v_mul_f32     v57, v183, v183                        \n"
"  v_fma_f32     v54, v54, s77, v1                      \n"
"  v_fma_f32     v55, v55, s77, v1                      \n"
"  v_fma_f32     v56, v56, s77, v1                      \n"
"  v_fma_f32     v57, v57, s77, v1                      \n"
"  v_mul_f32     v54, v54, v180                         \n"
"  v_mul_f32     v55, v55, v181                         \n"
"  v_mul_f32     v56, v56, v182                         \n"
"  v_mul_f32     v57, v57, v183                         \n"
"  v_mul_f32     v54, v54, s6                           \n"
"  v_mul_f32     v55, v55, s6                           \n"
"  v_mul_f32     v56, v56, s6                           \n"
"  v_mul_f32     v57, v57, s6                           \n"
"  v_exp_f32     v54, v54                               \n"
"  v_exp_f32     v55, v55                               \n"
"  v_exp_f32     v56, v56                               \n"
"  v_exp_f32     v57, v57                               \n"
"  buffer_load_dwordx4  acc[108:111], %[v_os_b2], s[12:15], 0 offen offset:3072\n"
"  v_add_f32     v54, v54, 1.0                          \n"
"  v_add_f32     v55, v55, 1.0                          \n"
"  v_add_f32     v56, v56, 1.0                          \n"
"  v_add_f32     v57, v57, 1.0                          \n"
"  v_rcp_f32     v54, v54                               \n"
"  v_rcp_f32     v55, v55                               \n"
"  v_rcp_f32     v56, v56                               \n"
"  v_rcp_f32     v57, v57                               \n"
"  v_mul_f32     v180, v180, v54                        \n"
"  v_mul_f32     v181, v181, v55                        \n"
"  v_mul_f32     v182, v182, v56                        \n"
"  v_mul_f32     v183, v183, v57                        \n"
// ---- Element-wise activation on v184..v187, interleaved with two B-tile loads ----
// Per element x this sequence evaluates
//     x = x * 1 / (1 + exp(s6 * x * (s77 * x*x + v1)))
// using v54..v57 as scratch. v_exp_f32 is the hardware exponential (base 2 per
// the ISA), so s6 presumably folds in the log2(e) factor and sign.
// NOTE(review): s6, s77 and v1 are initialised outside this chunk; the shape
// looks like a sigmoid-form GELU/tanh approximation - confirm against the setup code.
// The buffer_load lines fill acc[112:119] through resource s[12:15]; they are
// placed between ALU groups, so the statement order should be kept as is.
"  buffer_load_dwordx4  acc[112:115], %[v_os_b3], s[12:15], 0 offen\n"
// scratch = x * x
"  v_mul_f32     v54, v184, v184                        \n"
"  v_mul_f32     v55, v185, v185                        \n"
"  v_mul_f32     v56, v186, v186                        \n"
"  v_mul_f32     v57, v187, v187                        \n"
// scratch = scratch * s77 + v1
"  v_fma_f32     v54, v54, s77, v1                      \n"
"  v_fma_f32     v55, v55, s77, v1                      \n"
"  v_fma_f32     v56, v56, s77, v1                      \n"
"  v_fma_f32     v57, v57, s77, v1                      \n"
// scratch *= x
"  v_mul_f32     v54, v54, v184                         \n"
"  v_mul_f32     v55, v55, v185                         \n"
"  v_mul_f32     v56, v56, v186                         \n"
"  v_mul_f32     v57, v57, v187                         \n"
// scratch *= s6, then exponentiate
"  v_mul_f32     v54, v54, s6                           \n"
"  v_mul_f32     v55, v55, s6                           \n"
"  v_mul_f32     v56, v56, s6                           \n"
"  v_mul_f32     v57, v57, s6                           \n"
"  v_exp_f32     v54, v54                               \n"
"  v_exp_f32     v55, v55                               \n"
"  v_exp_f32     v56, v56                               \n"
"  v_exp_f32     v57, v57                               \n"
"  buffer_load_dwordx4  acc[116:119], %[v_os_b3], s[12:15], 0 offen offset:1024\n"
// scratch = 1 / (1 + scratch)
"  v_add_f32     v54, v54, 1.0                          \n"
"  v_add_f32     v55, v55, 1.0                          \n"
"  v_add_f32     v56, v56, 1.0                          \n"
"  v_add_f32     v57, v57, 1.0                          \n"
"  v_rcp_f32     v54, v54                               \n"
"  v_rcp_f32     v55, v55                               \n"
"  v_rcp_f32     v56, v56                               \n"
"  v_rcp_f32     v57, v57                               \n"
// x *= scratch (result written back in place)
"  v_mul_f32     v184, v184, v54                        \n"
"  v_mul_f32     v185, v185, v55                        \n"
"  v_mul_f32     v186, v186, v56                        \n"
"  v_mul_f32     v187, v187, v57                        \n"
// ---- Element-wise activation on v188..v191 (last group of the v128..v191 range) ----
// Same recipe as the preceding groups, with v54..v57 as scratch:
//     x = x * 1 / (1 + exp(s6 * x * (s77 * x*x + v1)))
// NOTE(review): s6 / s77 / v1 come from code outside this chunk - confirm their values.
// Two more loads into acc[120:127] through s[12:15] are interleaved with the ALU work.
"  buffer_load_dwordx4  acc[120:123], %[v_os_b3], s[12:15], 0 offen offset:2048\n"
// scratch = x * x
"  v_mul_f32     v54, v188, v188                        \n"
"  v_mul_f32     v55, v189, v189                        \n"
"  v_mul_f32     v56, v190, v190                        \n"
"  v_mul_f32     v57, v191, v191                        \n"
// scratch = scratch * s77 + v1
"  v_fma_f32     v54, v54, s77, v1                      \n"
"  v_fma_f32     v55, v55, s77, v1                      \n"
"  v_fma_f32     v56, v56, s77, v1                      \n"
"  v_fma_f32     v57, v57, s77, v1                      \n"
// scratch *= x
"  v_mul_f32     v54, v54, v188                         \n"
"  v_mul_f32     v55, v55, v189                         \n"
"  v_mul_f32     v56, v56, v190                         \n"
"  v_mul_f32     v57, v57, v191                         \n"
// scratch *= s6, then exponentiate
"  v_mul_f32     v54, v54, s6                           \n"
"  v_mul_f32     v55, v55, s6                           \n"
"  v_mul_f32     v56, v56, s6                           \n"
"  v_mul_f32     v57, v57, s6                           \n"
"  v_exp_f32     v54, v54                               \n"
"  v_exp_f32     v55, v55                               \n"
"  v_exp_f32     v56, v56                               \n"
"  v_exp_f32     v57, v57                               \n"
"  buffer_load_dwordx4  acc[124:127], %[v_os_b3], s[12:15], 0 offen offset:3072\n"
// scratch = 1 / (1 + scratch)
"  v_add_f32     v54, v54, 1.0                          \n"
"  v_add_f32     v55, v55, 1.0                          \n"
"  v_add_f32     v56, v56, 1.0                          \n"
"  v_add_f32     v57, v57, 1.0                          \n"
"  v_rcp_f32     v54, v54                               \n"
"  v_rcp_f32     v55, v55                               \n"
"  v_rcp_f32     v56, v56                               \n"
"  v_rcp_f32     v57, v57                               \n"
// x *= scratch (result written back in place)
"  v_mul_f32     v188, v188, v54                        \n"
"  v_mul_f32     v189, v189, v55                        \n"
"  v_mul_f32     v190, v190, v56                        \n"
"  v_mul_f32     v191, v191, v57                        \n"
// ---- Scale v128..v191 by a broadcast factor taken from v18 / v19 ----
// Every instruction is a DPP multiply: row_newbcast:N makes the first source
// (v18 or v19) read lane N of the current 16-lane row for all lanes of that row,
// so each destination register is multiplied by one selected lane value.
// Layout of the selection:
//   v128..v159 use v18, v160..v191 use v19;
//   within each half, every 8 consecutive registers share 4 lane indices
//   (N..N+3 used twice), stepping 0-3, 4-7, 8-11, 12-15.
// NOTE(review): v18/v19 are loaded outside this chunk; presumably they hold
// per-row scale/weight values - confirm with the code that fills them.
// --- v18, lanes 0..3 ---
"  v_mul_f32     v128, v18, v128 row_newbcast:0         \n"
"  v_mul_f32     v129, v18, v129 row_newbcast:1         \n"
"  v_mul_f32     v130, v18, v130 row_newbcast:2         \n"
"  v_mul_f32     v131, v18, v131 row_newbcast:3         \n"
"  v_mul_f32     v132, v18, v132 row_newbcast:0         \n"
"  v_mul_f32     v133, v18, v133 row_newbcast:1         \n"
"  v_mul_f32     v134, v18, v134 row_newbcast:2         \n"
"  v_mul_f32     v135, v18, v135 row_newbcast:3         \n"
// --- v18, lanes 4..7 ---
"  v_mul_f32     v136, v18, v136 row_newbcast:4         \n"
"  v_mul_f32     v137, v18, v137 row_newbcast:5         \n"
"  v_mul_f32     v138, v18, v138 row_newbcast:6         \n"
"  v_mul_f32     v139, v18, v139 row_newbcast:7         \n"
"  v_mul_f32     v140, v18, v140 row_newbcast:4         \n"
"  v_mul_f32     v141, v18, v141 row_newbcast:5         \n"
"  v_mul_f32     v142, v18, v142 row_newbcast:6         \n"
"  v_mul_f32     v143, v18, v143 row_newbcast:7         \n"
// --- v18, lanes 8..11 ---
"  v_mul_f32     v144, v18, v144 row_newbcast:8         \n"
"  v_mul_f32     v145, v18, v145 row_newbcast:9         \n"
"  v_mul_f32     v146, v18, v146 row_newbcast:10        \n"
"  v_mul_f32     v147, v18, v147 row_newbcast:11        \n"
"  v_mul_f32     v148, v18, v148 row_newbcast:8         \n"
"  v_mul_f32     v149, v18, v149 row_newbcast:9         \n"
"  v_mul_f32     v150, v18, v150 row_newbcast:10        \n"
"  v_mul_f32     v151, v18, v151 row_newbcast:11        \n"
// --- v18, lanes 12..15 ---
"  v_mul_f32     v152, v18, v152 row_newbcast:12        \n"
"  v_mul_f32     v153, v18, v153 row_newbcast:13        \n"
"  v_mul_f32     v154, v18, v154 row_newbcast:14        \n"
"  v_mul_f32     v155, v18, v155 row_newbcast:15        \n"
"  v_mul_f32     v156, v18, v156 row_newbcast:12        \n"
"  v_mul_f32     v157, v18, v157 row_newbcast:13        \n"
"  v_mul_f32     v158, v18, v158 row_newbcast:14        \n"
"  v_mul_f32     v159, v18, v159 row_newbcast:15        \n"
// --- v19, lanes 0..3 ---
"  v_mul_f32     v160, v19, v160 row_newbcast:0         \n"
"  v_mul_f32     v161, v19, v161 row_newbcast:1         \n"
"  v_mul_f32     v162, v19, v162 row_newbcast:2         \n"
"  v_mul_f32     v163, v19, v163 row_newbcast:3         \n"
"  v_mul_f32     v164, v19, v164 row_newbcast:0         \n"
"  v_mul_f32     v165, v19, v165 row_newbcast:1         \n"
"  v_mul_f32     v166, v19, v166 row_newbcast:2         \n"
"  v_mul_f32     v167, v19, v167 row_newbcast:3         \n"
// --- v19, lanes 4..7 ---
"  v_mul_f32     v168, v19, v168 row_newbcast:4         \n"
"  v_mul_f32     v169, v19, v169 row_newbcast:5         \n"
"  v_mul_f32     v170, v19, v170 row_newbcast:6         \n"
"  v_mul_f32     v171, v19, v171 row_newbcast:7         \n"
"  v_mul_f32     v172, v19, v172 row_newbcast:4         \n"
"  v_mul_f32     v173, v19, v173 row_newbcast:5         \n"
"  v_mul_f32     v174, v19, v174 row_newbcast:6         \n"
"  v_mul_f32     v175, v19, v175 row_newbcast:7         \n"
// --- v19, lanes 8..11 ---
"  v_mul_f32     v176, v19, v176 row_newbcast:8         \n"
"  v_mul_f32     v177, v19, v177 row_newbcast:9         \n"
"  v_mul_f32     v178, v19, v178 row_newbcast:10        \n"
"  v_mul_f32     v179, v19, v179 row_newbcast:11        \n"
"  v_mul_f32     v180, v19, v180 row_newbcast:8         \n"
"  v_mul_f32     v181, v19, v181 row_newbcast:9         \n"
"  v_mul_f32     v182, v19, v182 row_newbcast:10        \n"
"  v_mul_f32     v183, v19, v183 row_newbcast:11        \n"
// --- v19, lanes 12..15 ---
"  v_mul_f32     v184, v19, v184 row_newbcast:12        \n"
"  v_mul_f32     v185, v19, v185 row_newbcast:13        \n"
"  v_mul_f32     v186, v19, v186 row_newbcast:14        \n"
"  v_mul_f32     v187, v19, v187 row_newbcast:15        \n"
"  v_mul_f32     v188, v19, v188 row_newbcast:12        \n"
"  v_mul_f32     v189, v19, v189 row_newbcast:13        \n"
"  v_mul_f32     v190, v19, v190 row_newbcast:14        \n"
"  v_mul_f32     v191, v19, v191 row_newbcast:15        \n"
// ---- Absolute-maximum reduction of v128..v191 into v22 / v23 ----
// Produces two running maxima that the following quantisation step inverts:
//   v22 = max |x| over the first four registers of every group of eight
//         (v128..131, v136..139, ..., v184..187)
//   v23 = max |x| over the last four registers of every group of eight
//         (v132..135, v140..143, ..., v188..191)
// The reduction is done per lane first, then combined through LDS.
// A dword load into v12 (resource s[16:19], offset v5) is issued up front so it
// overlaps with the ALU work; v12 is not consumed inside this chunk.
"  buffer_load_dword  v12, v5, s[16:19], 0 offen        \n"
// Seed both maxima with 0x358637bd, the float32 bit pattern of ~1e-6, so the
// reciprocal taken later never sees zero.
"  v_mov_b32     v22, 0x358637bd                        \n"
"  v_mov_b32     v23, 0x358637bd                        \n"
// Per-lane fold: max3(|a|, |b|, running max), alternating v22 / v23 every 4 registers.
"  v_max3_f32    v22, abs(v128), abs(v129), v22         \n"
"  v_max3_f32    v22, abs(v130), abs(v131), v22         \n"
"  v_max3_f32    v23, abs(v132), abs(v133), v23         \n"
"  v_max3_f32    v23, abs(v134), abs(v135), v23         \n"
"  v_max3_f32    v22, abs(v136), abs(v137), v22         \n"
"  v_max3_f32    v22, abs(v138), abs(v139), v22         \n"
"  v_max3_f32    v23, abs(v140), abs(v141), v23         \n"
"  v_max3_f32    v23, abs(v142), abs(v143), v23         \n"
"  v_max3_f32    v22, abs(v144), abs(v145), v22         \n"
"  v_max3_f32    v22, abs(v146), abs(v147), v22         \n"
"  v_max3_f32    v23, abs(v148), abs(v149), v23         \n"
"  v_max3_f32    v23, abs(v150), abs(v151), v23         \n"
"  v_max3_f32    v22, abs(v152), abs(v153), v22         \n"
"  v_max3_f32    v22, abs(v154), abs(v155), v22         \n"
"  v_max3_f32    v23, abs(v156), abs(v157), v23         \n"
"  v_max3_f32    v23, abs(v158), abs(v159), v23         \n"
"  v_max3_f32    v22, abs(v160), abs(v161), v22         \n"
"  v_max3_f32    v22, abs(v162), abs(v163), v22         \n"
"  v_max3_f32    v23, abs(v164), abs(v165), v23         \n"
"  v_max3_f32    v23, abs(v166), abs(v167), v23         \n"
"  v_max3_f32    v22, abs(v168), abs(v169), v22         \n"
"  v_max3_f32    v22, abs(v170), abs(v171), v22         \n"
"  v_max3_f32    v23, abs(v172), abs(v173), v23         \n"
"  v_max3_f32    v23, abs(v174), abs(v175), v23         \n"
"  v_max3_f32    v22, abs(v176), abs(v177), v22         \n"
"  v_max3_f32    v22, abs(v178), abs(v179), v22         \n"
"  v_max3_f32    v23, abs(v180), abs(v181), v23         \n"
"  v_max3_f32    v23, abs(v182), abs(v183), v23         \n"
"  v_max3_f32    v22, abs(v184), abs(v185), v22         \n"
"  v_max3_f32    v22, abs(v186), abs(v187), v22         \n"
"  v_max3_f32    v23, abs(v188), abs(v189), v23         \n"
"  v_max3_f32    v23, abs(v190), abs(v191), v23         \n"
// Publish this lane's {v22, v23} pair to LDS at byte address
//   v0 * 8 + s7 * 0x200 + 16640.
// NOTE(review): v0 is presumably the lane id and s7 the wave id (64 lanes * 8 B
// = 0x200 per wave) - both are set outside this chunk; confirm.
"  v_lshlrev_b32  v54, 3, v0                            \n"
"  s_mul_i32     s60, 0x00000200, s7                    \n"
"  v_add_u32     v54, s60, v54                          \n"
"  ds_write_b64  v54, v[22:23] offset:16640             \n"
// Wait for the LDS write to retire, then synchronise so every published pair is visible.
"  s_waitcnt     lgkmcnt(0)                             \n"
"  s_barrier                                            \n"
// Read back 16 pairs at a 128-byte stride, all with the same (v0 & 15) slot:
// address = (v0 & 15) * 8 + 16640 + 128 * k, k = 0..15, into v[96:127].
"  v_and_b32     v54, 15, v0                            \n"
"  v_lshlrev_b32  v54, 3, v54                           \n"
"  ds_read_b64   v[96:97], v54 offset:16640             \n"
"  ds_read_b64   v[98:99], v54 offset:16768             \n"
"  ds_read_b64   v[100:101], v54 offset:16896           \n"
"  ds_read_b64   v[102:103], v54 offset:17024           \n"
"  ds_read_b64   v[104:105], v54 offset:17152           \n"
"  ds_read_b64   v[106:107], v54 offset:17280           \n"
"  ds_read_b64   v[108:109], v54 offset:17408           \n"
"  ds_read_b64   v[110:111], v54 offset:17536           \n"
"  ds_read_b64   v[112:113], v54 offset:17664           \n"
"  ds_read_b64   v[114:115], v54 offset:17792           \n"
"  ds_read_b64   v[116:117], v54 offset:17920           \n"
"  ds_read_b64   v[118:119], v54 offset:18048           \n"
"  ds_read_b64   v[120:121], v54 offset:18176           \n"
"  ds_read_b64   v[122:123], v54 offset:18304           \n"
"  ds_read_b64   v[124:125], v54 offset:18432           \n"
"  ds_read_b64   v[126:127], v54 offset:18560           \n"
"  s_waitcnt     lgkmcnt(0)                             \n"
// Final fold: even registers of the read-back pairs carry the v22 component,
// odd registers carry the v23 component.
"  v_max3_f32    v22, abs(v96), abs(v98), v22           \n"
"  v_max3_f32    v23, abs(v97), abs(v99), v23           \n"
"  v_max3_f32    v22, abs(v100), abs(v102), v22         \n"
"  v_max3_f32    v23, abs(v101), abs(v103), v23         \n"
"  v_max3_f32    v22, abs(v104), abs(v106), v22         \n"
"  v_max3_f32    v23, abs(v105), abs(v107), v23         \n"
"  v_max3_f32    v22, abs(v108), abs(v110), v22         \n"
"  v_max3_f32    v23, abs(v109), abs(v111), v23         \n"
"  v_max3_f32    v22, abs(v112), abs(v114), v22         \n"
"  v_max3_f32    v23, abs(v113), abs(v115), v23         \n"
"  v_max3_f32    v22, abs(v116), abs(v118), v22         \n"
"  v_max3_f32    v23, abs(v117), abs(v119), v23         \n"
"  v_max3_f32    v22, abs(v120), abs(v122), v22         \n"
"  v_max3_f32    v23, abs(v121), abs(v123), v23         \n"
"  v_max3_f32    v22, abs(v124), abs(v126), v22         \n"
"  v_max3_f32    v23, abs(v125), abs(v127), v23         \n"
// ---- Quantise v128..v191 (float) and pack four values per dword into v128..v143 ----
// v22 / v23 enter holding the two absolute maxima; they are turned into
// multipliers  scale = 127.0 / absmax  (0x42fe0000 is the float32 encoding of 127.0).
// Each group of four floats is then processed identically:
//   1. multiply by the group's scale (v22 for the first 4 of every 8 registers,
//      v23 for the last 4),
//   2. convert to 32-bit integer with v_cvt_i32_f32,
//   3. merge the four results into one dword with three v_perm_b32 byte
//      permutes whose selectors live in s53, s54, s55.
// NOTE(review): s53..s55 are set outside this chunk; presumably they pick the
// low byte of each converted value - confirm against their initialisation.
// The packed dwords are written to v128, v129, ... v143 in order. This reuses
// registers in place: a packed destination is only written after the float
// that used to live in that register has already been consumed by an earlier
// group, so the group order must not be changed.
"  v_rcp_f32     v22, v22                               \n"
"  v_rcp_f32     v23, v23                               \n"
"  v_mul_f32     v22, 0x42fe0000, v22                   \n"
"  v_mul_f32     v23, 0x42fe0000, v23                   \n"
// group 0: v128..v131 (scale v22) -> packed v128
"  v_mul_f32     v128, v22, v128                        \n"
"  v_mul_f32     v129, v22, v129                        \n"
"  v_mul_f32     v130, v22, v130                        \n"
"  v_mul_f32     v131, v22, v131                        \n"
"  v_cvt_i32_f32  v128, v128                            \n"
"  v_cvt_i32_f32  v129, v129                            \n"
"  v_cvt_i32_f32  v130, v130                            \n"
"  v_cvt_i32_f32  v131, v131                            \n"
"  v_perm_b32    v128, v129, v128, s53                  \n"
"  v_perm_b32    v128, v130, v128, s54                  \n"
"  v_perm_b32    v128, v131, v128, s55                  \n"
// group 1: v132..v135 (scale v23) -> packed v129
"  v_mul_f32     v132, v23, v132                        \n"
"  v_mul_f32     v133, v23, v133                        \n"
"  v_mul_f32     v134, v23, v134                        \n"
"  v_mul_f32     v135, v23, v135                        \n"
"  v_cvt_i32_f32  v132, v132                            \n"
"  v_cvt_i32_f32  v133, v133                            \n"
"  v_cvt_i32_f32  v134, v134                            \n"
"  v_cvt_i32_f32  v135, v135                            \n"
"  v_perm_b32    v129, v133, v132, s53                  \n"
"  v_perm_b32    v129, v134, v129, s54                  \n"
"  v_perm_b32    v129, v135, v129, s55                  \n"
// group 2: v136..v139 (scale v22) -> packed v130
"  v_mul_f32     v136, v22, v136                        \n"
"  v_mul_f32     v137, v22, v137                        \n"
"  v_mul_f32     v138, v22, v138                        \n"
"  v_mul_f32     v139, v22, v139                        \n"
"  v_cvt_i32_f32  v136, v136                            \n"
"  v_cvt_i32_f32  v137, v137                            \n"
"  v_cvt_i32_f32  v138, v138                            \n"
"  v_cvt_i32_f32  v139, v139                            \n"
"  v_perm_b32    v130, v137, v136, s53                  \n"
"  v_perm_b32    v130, v138, v130, s54                  \n"
"  v_perm_b32    v130, v139, v130, s55                  \n"
// group 3: v140..v143 (scale v23) -> packed v131
"  v_mul_f32     v140, v23, v140                        \n"
"  v_mul_f32     v141, v23, v141                        \n"
"  v_mul_f32     v142, v23, v142                        \n"
"  v_mul_f32     v143, v23, v143                        \n"
"  v_cvt_i32_f32  v140, v140                            \n"
"  v_cvt_i32_f32  v141, v141                            \n"
"  v_cvt_i32_f32  v142, v142                            \n"
"  v_cvt_i32_f32  v143, v143                            \n"
"  v_perm_b32    v131, v141, v140, s53                  \n"
"  v_perm_b32    v131, v142, v131, s54                  \n"
"  v_perm_b32    v131, v143, v131, s55                  \n"
// group 4: v144..v147 (scale v22) -> packed v132
"  v_mul_f32     v144, v22, v144                        \n"
"  v_mul_f32     v145, v22, v145                        \n"
"  v_mul_f32     v146, v22, v146                        \n"
"  v_mul_f32     v147, v22, v147                        \n"
"  v_cvt_i32_f32  v144, v144                            \n"
"  v_cvt_i32_f32  v145, v145                            \n"
"  v_cvt_i32_f32  v146, v146                            \n"
"  v_cvt_i32_f32  v147, v147                            \n"
"  v_perm_b32    v132, v145, v144, s53                  \n"
"  v_perm_b32    v132, v146, v132, s54                  \n"
"  v_perm_b32    v132, v147, v132, s55                  \n"
// group 5: v148..v151 (scale v23) -> packed v133
"  v_mul_f32     v148, v23, v148                        \n"
"  v_mul_f32     v149, v23, v149                        \n"
"  v_mul_f32     v150, v23, v150                        \n"
"  v_mul_f32     v151, v23, v151                        \n"
"  v_cvt_i32_f32  v148, v148                            \n"
"  v_cvt_i32_f32  v149, v149                            \n"
"  v_cvt_i32_f32  v150, v150                            \n"
"  v_cvt_i32_f32  v151, v151                            \n"
"  v_perm_b32    v133, v149, v148, s53                  \n"
"  v_perm_b32    v133, v150, v133, s54                  \n"
"  v_perm_b32    v133, v151, v133, s55                  \n"
// group 6: v152..v155 (scale v22) -> packed v134
"  v_mul_f32     v152, v22, v152                        \n"
"  v_mul_f32     v153, v22, v153                        \n"
"  v_mul_f32     v154, v22, v154                        \n"
"  v_mul_f32     v155, v22, v155                        \n"
"  v_cvt_i32_f32  v152, v152                            \n"
"  v_cvt_i32_f32  v153, v153                            \n"
"  v_cvt_i32_f32  v154, v154                            \n"
"  v_cvt_i32_f32  v155, v155                            \n"
"  v_perm_b32    v134, v153, v152, s53                  \n"
"  v_perm_b32    v134, v154, v134, s54                  \n"
"  v_perm_b32    v134, v155, v134, s55                  \n"
// group 7: v156..v159 (scale v23) -> packed v135
"  v_mul_f32     v156, v23, v156                        \n"
"  v_mul_f32     v157, v23, v157                        \n"
"  v_mul_f32     v158, v23, v158                        \n"
"  v_mul_f32     v159, v23, v159                        \n"
"  v_cvt_i32_f32  v156, v156                            \n"
"  v_cvt_i32_f32  v157, v157                            \n"
"  v_cvt_i32_f32  v158, v158                            \n"
"  v_cvt_i32_f32  v159, v159                            \n"
"  v_perm_b32    v135, v157, v156, s53                  \n"
"  v_perm_b32    v135, v158, v135, s54                  \n"
"  v_perm_b32    v135, v159, v135, s55                  \n"
// group 8: v160..v163 (scale v22) -> packed v136
"  v_mul_f32     v160, v22, v160                        \n"
"  v_mul_f32     v161, v22, v161                        \n"
"  v_mul_f32     v162, v22, v162                        \n"
"  v_mul_f32     v163, v22, v163                        \n"
"  v_cvt_i32_f32  v160, v160                            \n"
"  v_cvt_i32_f32  v161, v161                            \n"
"  v_cvt_i32_f32  v162, v162                            \n"
"  v_cvt_i32_f32  v163, v163                            \n"
"  v_perm_b32    v136, v161, v160, s53                  \n"
"  v_perm_b32    v136, v162, v136, s54                  \n"
"  v_perm_b32    v136, v163, v136, s55                  \n"
// group 9: v164..v167 (scale v23) -> packed v137
"  v_mul_f32     v164, v23, v164                        \n"
"  v_mul_f32     v165, v23, v165                        \n"
"  v_mul_f32     v166, v23, v166                        \n"
"  v_mul_f32     v167, v23, v167                        \n"
"  v_cvt_i32_f32  v164, v164                            \n"
"  v_cvt_i32_f32  v165, v165                            \n"
"  v_cvt_i32_f32  v166, v166                            \n"
"  v_cvt_i32_f32  v167, v167                            \n"
"  v_perm_b32    v137, v165, v164, s53                  \n"
"  v_perm_b32    v137, v166, v137, s54                  \n"
"  v_perm_b32    v137, v167, v137, s55                  \n"
// group 10: v168..v171 (scale v22) -> packed v138
"  v_mul_f32     v168, v22, v168                        \n"
"  v_mul_f32     v169, v22, v169                        \n"
"  v_mul_f32     v170, v22, v170                        \n"
"  v_mul_f32     v171, v22, v171                        \n"
"  v_cvt_i32_f32  v168, v168                            \n"
"  v_cvt_i32_f32  v169, v169                            \n"
"  v_cvt_i32_f32  v170, v170                            \n"
"  v_cvt_i32_f32  v171, v171                            \n"
"  v_perm_b32    v138, v169, v168, s53                  \n"
"  v_perm_b32    v138, v170, v138, s54                  \n"
"  v_perm_b32    v138, v171, v138, s55                  \n"
// group 11: v172..v175 (scale v23) -> packed v139
"  v_mul_f32     v172, v23, v172                        \n"
"  v_mul_f32     v173, v23, v173                        \n"
"  v_mul_f32     v174, v23, v174                        \n"
"  v_mul_f32     v175, v23, v175                        \n"
"  v_cvt_i32_f32  v172, v172                            \n"
"  v_cvt_i32_f32  v173, v173                            \n"
"  v_cvt_i32_f32  v174, v174                            \n"
"  v_cvt_i32_f32  v175, v175                            \n"
"  v_perm_b32    v139, v173, v172, s53                  \n"
"  v_perm_b32    v139, v174, v139, s54                  \n"
"  v_perm_b32    v139, v175, v139, s55                  \n"
// group 12: v176..v179 (scale v22) -> packed v140
"  v_mul_f32     v176, v22, v176                        \n"
"  v_mul_f32     v177, v22, v177                        \n"
"  v_mul_f32     v178, v22, v178                        \n"
"  v_mul_f32     v179, v22, v179                        \n"
"  v_cvt_i32_f32  v176, v176                            \n"
"  v_cvt_i32_f32  v177, v177                            \n"
"  v_cvt_i32_f32  v178, v178                            \n"
"  v_cvt_i32_f32  v179, v179                            \n"
"  v_perm_b32    v140, v177, v176, s53                  \n"
"  v_perm_b32    v140, v178, v140, s54                  \n"
"  v_perm_b32    v140, v179, v140, s55                  \n"
// group 13: v180..v183 (scale v23) -> packed v141
"  v_mul_f32     v180, v23, v180                        \n"
"  v_mul_f32     v181, v23, v181                        \n"
"  v_mul_f32     v182, v23, v182                        \n"
"  v_mul_f32     v183, v23, v183                        \n"
"  v_cvt_i32_f32  v180, v180                            \n"
"  v_cvt_i32_f32  v181, v181                            \n"
"  v_cvt_i32_f32  v182, v182                            \n"
"  v_cvt_i32_f32  v183, v183                            \n"
"  v_perm_b32    v141, v181, v180, s53                  \n"
"  v_perm_b32    v141, v182, v141, s54                  \n"
"  v_perm_b32    v141, v183, v141, s55                  \n"
// group 14: v184..v187 (scale v22) -> packed v142
"  v_mul_f32     v184, v22, v184                        \n"
"  v_mul_f32     v185, v22, v185                        \n"
"  v_mul_f32     v186, v22, v186                        \n"
"  v_mul_f32     v187, v22, v187                        \n"
"  v_cvt_i32_f32  v184, v184                            \n"
"  v_cvt_i32_f32  v185, v185                            \n"
"  v_cvt_i32_f32  v186, v186                            \n"
"  v_cvt_i32_f32  v187, v187                            \n"
"  v_perm_b32    v142, v185, v184, s53                  \n"
"  v_perm_b32    v142, v186, v142, s54                  \n"
"  v_perm_b32    v142, v187, v142, s55                  \n"
// group 15: v188..v191 (scale v23) -> packed v143
"  v_mul_f32     v188, v23, v188                        \n"
"  v_mul_f32     v189, v23, v189                        \n"
"  v_mul_f32     v190, v23, v190                        \n"
"  v_mul_f32     v191, v23, v191                        \n"
"  v_cvt_i32_f32  v188, v188                            \n"
"  v_cvt_i32_f32  v189, v189                            \n"
"  v_cvt_i32_f32  v190, v190                            \n"
"  v_cvt_i32_f32  v191, v191                            \n"
"  v_perm_b32    v143, v189, v188, s53                  \n"
"  v_perm_b32    v143, v190, v143, s54                  \n"
"  v_perm_b32    v143, v191, v143, s55                  \n"
// Keep the inverse multipliers (absmax / 127) in v24 / v25.
// NOTE(review): presumably the dequantisation scales applied to the integer
// MFMA results later - their use is outside this chunk.
"  v_rcp_f32     v24, v22                               \n"
"  v_rcp_f32     v25, v23                               \n"
// ---- Redistribute the 16 packed dwords (v128..v143) through LDS ----
// Each lane writes its 16 dwords, all lanes synchronise, and each lane then
// reads 32 qwords back into v[128:191] using a different address pattern.
// Those registers are the second operand of the MFMA chain that follows.
//
// Write address (bytes):
//   idx  = (v0 >> 5) * 32 + ((v0 & 31) >> 4) + (v0 & 15) * 2
//   addr = idx * 4 + s7 * 0x100
// NOTE(review): v0 / s7 are presumably lane id / wave id (set outside this chunk).
"  v_lshrrev_b32  v54, 5, v0                            \n"
"  v_lshlrev_b32  v55, 5, v54                           \n"
"  v_and_b32     v54, 31, v0                            \n"
"  v_lshrrev_b32  v56, 4, v54                           \n"
"  v_add_u32     v55, v56, v55                          \n"
"  v_and_b32     v54, 15, v0                            \n"
"  v_lshlrev_b32  v54, 1, v54                           \n"
"  v_add_u32     v55, v54, v55                          \n"
"  v_lshlrev_b32  v54, 2, v55                           \n"
"  s_mul_i32     s60, 0x00000100, s7                    \n"
"  v_add_u32     v54, v54, s60                          \n"
// Even-numbered sources (packed with scale v22) go to 18688 + 1024*k,
// odd-numbered sources (packed with scale v23) go to 26880 + 1024*k (8192 bytes higher).
"  ds_write_b32  v54, v128 offset:18688                 \n"
"  ds_write_b32  v54, v129 offset:26880                 \n"
"  ds_write_b32  v54, v130 offset:19712                 \n"
"  ds_write_b32  v54, v131 offset:27904                 \n"
"  ds_write_b32  v54, v132 offset:20736                 \n"
"  ds_write_b32  v54, v133 offset:28928                 \n"
"  ds_write_b32  v54, v134 offset:21760                 \n"
"  ds_write_b32  v54, v135 offset:29952                 \n"
"  ds_write_b32  v54, v136 offset:22784                 \n"
"  ds_write_b32  v54, v137 offset:30976                 \n"
"  ds_write_b32  v54, v138 offset:23808                 \n"
"  ds_write_b32  v54, v139 offset:32000                 \n"
"  ds_write_b32  v54, v140 offset:24832                 \n"
"  ds_write_b32  v54, v141 offset:33024                 \n"
"  ds_write_b32  v54, v142 offset:25856                 \n"
"  ds_write_b32  v54, v143 offset:34048                 \n"
// All LDS writes must have landed and be visible before anyone reads back.
"  s_waitcnt     lgkmcnt(0)                             \n"
"  s_barrier                                            \n"
// Read address (bytes): ((v0 >> 4) * 64 + (v0 & 15) * 2) * 4
"  v_lshrrev_b32  v54, 4, v0                            \n"
"  v_lshlrev_b32  v55, 6, v54                           \n"
"  v_and_b32     v54, 15, v0                            \n"
"  v_lshlrev_b32  v54, 1, v54                           \n"
"  v_add_u32     v55, v54, v55                          \n"
"  v_lshlrev_b32  v54, 2, v55                           \n"
// Two qwords (base and base + 128) are read from each 1024-byte slab.
// v[128:159] come from the 18688.. region, v[160:191] from the 26880.. region.
"  ds_read_b64   v[128:129], v54 offset:18688           \n"
"  ds_read_b64   v[130:131], v54 offset:18816           \n"
"  ds_read_b64   v[132:133], v54 offset:19712           \n"
"  ds_read_b64   v[134:135], v54 offset:19840           \n"
"  ds_read_b64   v[136:137], v54 offset:20736           \n"
"  ds_read_b64   v[138:139], v54 offset:20864           \n"
"  ds_read_b64   v[140:141], v54 offset:21760           \n"
"  ds_read_b64   v[142:143], v54 offset:21888           \n"
"  ds_read_b64   v[144:145], v54 offset:22784           \n"
"  ds_read_b64   v[146:147], v54 offset:22912           \n"
"  ds_read_b64   v[148:149], v54 offset:23808           \n"
"  ds_read_b64   v[150:151], v54 offset:23936           \n"
"  ds_read_b64   v[152:153], v54 offset:24832           \n"
"  ds_read_b64   v[154:155], v54 offset:24960           \n"
"  ds_read_b64   v[156:157], v54 offset:25856           \n"
"  ds_read_b64   v[158:159], v54 offset:25984           \n"
"  ds_read_b64   v[160:161], v54 offset:26880           \n"
"  ds_read_b64   v[162:163], v54 offset:27008           \n"
"  ds_read_b64   v[164:165], v54 offset:27904           \n"
"  ds_read_b64   v[166:167], v54 offset:28032           \n"
"  ds_read_b64   v[168:169], v54 offset:28928           \n"
"  ds_read_b64   v[170:171], v54 offset:29056           \n"
"  ds_read_b64   v[172:173], v54 offset:29952           \n"
"  ds_read_b64   v[174:175], v54 offset:30080           \n"
"  ds_read_b64   v[176:177], v54 offset:30976           \n"
"  ds_read_b64   v[178:179], v54 offset:31104           \n"
"  ds_read_b64   v[180:181], v54 offset:32000           \n"
"  ds_read_b64   v[182:183], v54 offset:32128           \n"
"  ds_read_b64   v[184:185], v54 offset:33024           \n"
"  ds_read_b64   v[186:187], v54 offset:33152           \n"
"  ds_read_b64   v[188:189], v54 offset:34048           \n"
"  ds_read_b64   v[190:191], v54 offset:34176           \n"
// Advance the 64-bit base held in s[12:13] by %[s_tile_os_b] and the one in
// s[16:17] by %[s_tile_os_dq] (add low dword, then propagate the carry).
"  s_add_u32     s12, %[s_tile_os_b], s12                          \n"
"  s_addc_u32    s13, 0, s13                            \n"
"  s_add_u32     s16, %[s_tile_os_dq], s16                          \n"
"  s_addc_u32    s17, 0, s17                            \n"
// s80 is cleared here; NOTE(review): presumably the loop counter - its use is
// outside this chunk.
"  s_mov_b32     s80, 0                                 \n"
// Drain every outstanding memory/LDS counter before entering the loop.
"  s_waitcnt     0x0000                                 \n"
// ---- label_0C3C: branch target; first MFMA stage of the body that follows ----
// (The branch back to this label is outside this chunk.)
// Four independent int8 MFMA accumulation chains are issued, eight MFMAs each.
// The first MFMA of every chain uses 0 as its accumulator input, so each
// result starts fresh:
//   v[192:195] = sum_k acc[2k : 2k+1]      * v[128+2k : 129+2k],  k = 0..7
//   v[196:199] = sum_k acc[2k : 2k+1]      * v[160+2k : 161+2k]
//   v[200:203] = sum_k acc[16+2k : 17+2k]  * v[128+2k : 129+2k]
//   v[204:207] = sum_k acc[16+2k : 17+2k]  * v[160+2k : 161+2k]
// Between MFMAs, buffer loads fill acc[128:159] through resource s[12:15]
// (offsets %[v_os_b0] / %[v_os_b1] plus 0, 1024, 2048, 3072). Those registers
// are not read in this stage. The interleaving is hand-scheduled; keep the
// statement order.
// NOTE(review): the mnemonic is spelled out here instead of using the
// _UK_MFMA_ macro defined at the top of the file - confirm this is intended
// (the surrounding quantisation code is int8-specific).
"label_0C3C:                   \n"
// Allow at most 41 vector-memory operations to remain in flight, then synchronise.
"  s_waitcnt     vmcnt(41)                              \n"
"  s_barrier                                            \n"
// chain 0: v[192:195] <- acc[0:15] x v[128:143]
"  v_mfma_i32_16x16x32_i8  v[192:195], acc[0:1], v[128:129], 0\n"
"  v_mfma_i32_16x16x32_i8  v[192:195], acc[2:3], v[130:131], v[192:195]\n"
"  buffer_load_dwordx4  acc[128:131], %[v_os_b0], s[12:15], 0 offen\n"
"  v_mfma_i32_16x16x32_i8  v[192:195], acc[4:5], v[132:133], v[192:195]\n"
"  v_mfma_i32_16x16x32_i8  v[192:195], acc[6:7], v[134:135], v[192:195]\n"
"  v_mfma_i32_16x16x32_i8  v[192:195], acc[8:9], v[136:137], v[192:195]\n"
"  v_mfma_i32_16x16x32_i8  v[192:195], acc[10:11], v[138:139], v[192:195]\n"
"  buffer_load_dwordx4  acc[132:135], %[v_os_b0], s[12:15], 0 offen offset:1024\n"
"  v_mfma_i32_16x16x32_i8  v[192:195], acc[12:13], v[140:141], v[192:195]\n"
"  v_mfma_i32_16x16x32_i8  v[192:195], acc[14:15], v[142:143], v[192:195]\n"
// chain 1: v[196:199] <- acc[0:15] x v[160:175]
"  v_mfma_i32_16x16x32_i8  v[196:199], acc[0:1], v[160:161], 0\n"
"  v_mfma_i32_16x16x32_i8  v[196:199], acc[2:3], v[162:163], v[196:199]\n"
"  buffer_load_dwordx4  acc[136:139], %[v_os_b0], s[12:15], 0 offen offset:2048\n"
"  v_mfma_i32_16x16x32_i8  v[196:199], acc[4:5], v[164:165], v[196:199]\n"
"  v_mfma_i32_16x16x32_i8  v[196:199], acc[6:7], v[166:167], v[196:199]\n"
"  v_mfma_i32_16x16x32_i8  v[196:199], acc[8:9], v[168:169], v[196:199]\n"
"  v_mfma_i32_16x16x32_i8  v[196:199], acc[10:11], v[170:171], v[196:199]\n"
"  buffer_load_dwordx4  acc[140:143], %[v_os_b0], s[12:15], 0 offen offset:3072\n"
"  v_mfma_i32_16x16x32_i8  v[196:199], acc[12:13], v[172:173], v[196:199]\n"
"  v_mfma_i32_16x16x32_i8  v[196:199], acc[14:15], v[174:175], v[196:199]\n"
// chain 2: v[200:203] <- acc[16:31] x v[128:143]
"  v_mfma_i32_16x16x32_i8  v[200:203], acc[16:17], v[128:129], 0\n"
"  v_mfma_i32_16x16x32_i8  v[200:203], acc[18:19], v[130:131], v[200:203]\n"
"  buffer_load_dwordx4  acc[144:147], %[v_os_b1], s[12:15], 0 offen\n"
"  v_mfma_i32_16x16x32_i8  v[200:203], acc[20:21], v[132:133], v[200:203]\n"
"  v_mfma_i32_16x16x32_i8  v[200:203], acc[22:23], v[134:135], v[200:203]\n"
"  v_mfma_i32_16x16x32_i8  v[200:203], acc[24:25], v[136:137], v[200:203]\n"
"  v_mfma_i32_16x16x32_i8  v[200:203], acc[26:27], v[138:139], v[200:203]\n"
"  buffer_load_dwordx4  acc[148:151], %[v_os_b1], s[12:15], 0 offen offset:1024\n"
"  v_mfma_i32_16x16x32_i8  v[200:203], acc[28:29], v[140:141], v[200:203]\n"
"  v_mfma_i32_16x16x32_i8  v[200:203], acc[30:31], v[142:143], v[200:203]\n"
// chain 3: v[204:207] <- acc[16:31] x v[160:175]
"  v_mfma_i32_16x16x32_i8  v[204:207], acc[16:17], v[160:161], 0\n"
"  v_mfma_i32_16x16x32_i8  v[204:207], acc[18:19], v[162:163], v[204:207]\n"
"  buffer_load_dwordx4  acc[152:155], %[v_os_b1], s[12:15], 0 offen offset:2048\n"
"  v_mfma_i32_16x16x32_i8  v[204:207], acc[20:21], v[164:165], v[204:207]\n"
"  v_mfma_i32_16x16x32_i8  v[204:207], acc[22:23], v[166:167], v[204:207]\n"
"  v_mfma_i32_16x16x32_i8  v[204:207], acc[24:25], v[168:169], v[204:207]\n"
"  v_mfma_i32_16x16x32_i8  v[204:207], acc[26:27], v[170:171], v[204:207]\n"
"  buffer_load_dwordx4  acc[156:159], %[v_os_b1], s[12:15], 0 offen offset:3072\n"
"  v_mfma_i32_16x16x32_i8  v[204:207], acc[28:29], v[172:173], v[204:207]\n"
"  v_mfma_i32_16x16x32_i8  v[204:207], acc[30:31], v[174:175], v[204:207]\n"
// ---- MFMA stage over acc[32:63] ----
// Same structure as the stage over acc[0:31]: four fresh accumulation chains
// (first MFMA of each uses 0 as accumulator input), eight MFMAs per chain:
//   v[208:211] = sum_k acc[32+2k : 33+2k] * v[128+2k : 129+2k],  k = 0..7
//   v[212:215] = sum_k acc[32+2k : 33+2k] * v[160+2k : 161+2k]
//   v[216:219] = sum_k acc[48+2k : 49+2k] * v[128+2k : 129+2k]
//   v[220:223] = sum_k acc[48+2k : 49+2k] * v[160+2k : 161+2k]
// Interleaved buffer loads fill acc[160:191] via %[v_os_b2] / %[v_os_b3].
// After the last of those loads is issued, the base in s[12:13] is advanced by
// %[s_tile_os_b_half], so the s_add/s_addc pair must stay after the load at
// offset:3072 and before any later load that uses s[12:15].
// Allow at most 41 vector-memory operations to remain in flight before continuing.
"  s_waitcnt     vmcnt(41)                              \n"
// chain 0: v[208:211] <- acc[32:47] x v[128:143]
"  v_mfma_i32_16x16x32_i8  v[208:211], acc[32:33], v[128:129], 0\n"
"  v_mfma_i32_16x16x32_i8  v[208:211], acc[34:35], v[130:131], v[208:211]\n"
"  buffer_load_dwordx4  acc[160:163], %[v_os_b2], s[12:15], 0 offen\n"
"  v_mfma_i32_16x16x32_i8  v[208:211], acc[36:37], v[132:133], v[208:211]\n"
"  v_mfma_i32_16x16x32_i8  v[208:211], acc[38:39], v[134:135], v[208:211]\n"
"  v_mfma_i32_16x16x32_i8  v[208:211], acc[40:41], v[136:137], v[208:211]\n"
"  v_mfma_i32_16x16x32_i8  v[208:211], acc[42:43], v[138:139], v[208:211]\n"
"  buffer_load_dwordx4  acc[164:167], %[v_os_b2], s[12:15], 0 offen offset:1024\n"
"  v_mfma_i32_16x16x32_i8  v[208:211], acc[44:45], v[140:141], v[208:211]\n"
"  v_mfma_i32_16x16x32_i8  v[208:211], acc[46:47], v[142:143], v[208:211]\n"
// chain 1: v[212:215] <- acc[32:47] x v[160:175]
"  v_mfma_i32_16x16x32_i8  v[212:215], acc[32:33], v[160:161], 0\n"
"  v_mfma_i32_16x16x32_i8  v[212:215], acc[34:35], v[162:163], v[212:215]\n"
"  buffer_load_dwordx4  acc[168:171], %[v_os_b2], s[12:15], 0 offen offset:2048\n"
"  v_mfma_i32_16x16x32_i8  v[212:215], acc[36:37], v[164:165], v[212:215]\n"
"  v_mfma_i32_16x16x32_i8  v[212:215], acc[38:39], v[166:167], v[212:215]\n"
"  v_mfma_i32_16x16x32_i8  v[212:215], acc[40:41], v[168:169], v[212:215]\n"
"  v_mfma_i32_16x16x32_i8  v[212:215], acc[42:43], v[170:171], v[212:215]\n"
"  buffer_load_dwordx4  acc[172:175], %[v_os_b2], s[12:15], 0 offen offset:3072\n"
"  v_mfma_i32_16x16x32_i8  v[212:215], acc[44:45], v[172:173], v[212:215]\n"
"  v_mfma_i32_16x16x32_i8  v[212:215], acc[46:47], v[174:175], v[212:215]\n"
// chain 2: v[216:219] <- acc[48:63] x v[128:143]
"  v_mfma_i32_16x16x32_i8  v[216:219], acc[48:49], v[128:129], 0\n"
"  v_mfma_i32_16x16x32_i8  v[216:219], acc[50:51], v[130:131], v[216:219]\n"
"  buffer_load_dwordx4  acc[176:179], %[v_os_b3], s[12:15], 0 offen\n"
"  v_mfma_i32_16x16x32_i8  v[216:219], acc[52:53], v[132:133], v[216:219]\n"
"  v_mfma_i32_16x16x32_i8  v[216:219], acc[54:55], v[134:135], v[216:219]\n"
"  v_mfma_i32_16x16x32_i8  v[216:219], acc[56:57], v[136:137], v[216:219]\n"
"  v_mfma_i32_16x16x32_i8  v[216:219], acc[58:59], v[138:139], v[216:219]\n"
"  buffer_load_dwordx4  acc[180:183], %[v_os_b3], s[12:15], 0 offen offset:1024\n"
"  v_mfma_i32_16x16x32_i8  v[216:219], acc[60:61], v[140:141], v[216:219]\n"
"  v_mfma_i32_16x16x32_i8  v[216:219], acc[62:63], v[142:143], v[216:219]\n"
// chain 3: v[220:223] <- acc[48:63] x v[160:175]
"  v_mfma_i32_16x16x32_i8  v[220:223], acc[48:49], v[160:161], 0\n"
"  v_mfma_i32_16x16x32_i8  v[220:223], acc[50:51], v[162:163], v[220:223]\n"
"  buffer_load_dwordx4  acc[184:187], %[v_os_b3], s[12:15], 0 offen offset:2048\n"
"  v_mfma_i32_16x16x32_i8  v[220:223], acc[52:53], v[164:165], v[220:223]\n"
"  v_mfma_i32_16x16x32_i8  v[220:223], acc[54:55], v[166:167], v[220:223]\n"
"  v_mfma_i32_16x16x32_i8  v[220:223], acc[56:57], v[168:169], v[220:223]\n"
"  v_mfma_i32_16x16x32_i8  v[220:223], acc[58:59], v[170:171], v[220:223]\n"
"  buffer_load_dwordx4  acc[188:191], %[v_os_b3], s[12:15], 0 offen offset:3072\n"
// 64-bit advance of the base in s[12:13] (low dword add, then carry).
"  s_add_u32     s12, %[s_tile_os_b_half], s12                          \n"
"  s_addc_u32    s13, 0, s13                            \n"
"  v_mfma_i32_16x16x32_i8  v[220:223], acc[60:61], v[172:173], v[220:223]\n"
"  v_mfma_i32_16x16x32_i8  v[220:223], acc[62:63], v[174:175], v[220:223]\n"
// Stall until at most 41 vector-memory operations of this wave are still
// outstanding. The MFMAs below read acc[64:95]; presumably this wait is what
// guarantees those registers have been filled - the producing loads are outside
// this chunk, confirm before changing the count.
"  s_waitcnt     vmcnt(41)                              \n"
// Unlike the 0-seeded chains, all four chains in this section pass the
// destination itself as the addend from their first MFMA, i.e. they continue
// accumulating into v[192:207]. B operands come from v[144:159] and v[176:191].
// Interleaved loads refill acc[192:223] via %[v_os_b0] and %[v_os_b1].
"  v_mfma_i32_16x16x32_i8  v[192:195], acc[64:65], v[144:145], v[192:195]\n"
"  v_mfma_i32_16x16x32_i8  v[192:195], acc[66:67], v[146:147], v[192:195]\n"
"  buffer_load_dwordx4  acc[192:195], %[v_os_b0], s[12:15], 0 offen\n"
"  v_mfma_i32_16x16x32_i8  v[192:195], acc[68:69], v[148:149], v[192:195]\n"
"  v_mfma_i32_16x16x32_i8  v[192:195], acc[70:71], v[150:151], v[192:195]\n"
"  v_mfma_i32_16x16x32_i8  v[192:195], acc[72:73], v[152:153], v[192:195]\n"
"  v_mfma_i32_16x16x32_i8  v[192:195], acc[74:75], v[154:155], v[192:195]\n"
"  buffer_load_dwordx4  acc[196:199], %[v_os_b0], s[12:15], 0 offen offset:1024\n"
"  v_mfma_i32_16x16x32_i8  v[192:195], acc[76:77], v[156:157], v[192:195]\n"
"  v_mfma_i32_16x16x32_i8  v[192:195], acc[78:79], v[158:159], v[192:195]\n"
// v[196:199] += acc[64:79] x v[176:191]
"  v_mfma_i32_16x16x32_i8  v[196:199], acc[64:65], v[176:177], v[196:199]\n"
"  v_mfma_i32_16x16x32_i8  v[196:199], acc[66:67], v[178:179], v[196:199]\n"
"  buffer_load_dwordx4  acc[200:203], %[v_os_b0], s[12:15], 0 offen offset:2048\n"
"  v_mfma_i32_16x16x32_i8  v[196:199], acc[68:69], v[180:181], v[196:199]\n"
"  v_mfma_i32_16x16x32_i8  v[196:199], acc[70:71], v[182:183], v[196:199]\n"
"  v_mfma_i32_16x16x32_i8  v[196:199], acc[72:73], v[184:185], v[196:199]\n"
"  v_mfma_i32_16x16x32_i8  v[196:199], acc[74:75], v[186:187], v[196:199]\n"
"  buffer_load_dwordx4  acc[204:207], %[v_os_b0], s[12:15], 0 offen offset:3072\n"
"  v_mfma_i32_16x16x32_i8  v[196:199], acc[76:77], v[188:189], v[196:199]\n"
"  v_mfma_i32_16x16x32_i8  v[196:199], acc[78:79], v[190:191], v[196:199]\n"
// v[200:203] += acc[80:95] x v[144:159]
"  v_mfma_i32_16x16x32_i8  v[200:203], acc[80:81], v[144:145], v[200:203]\n"
"  v_mfma_i32_16x16x32_i8  v[200:203], acc[82:83], v[146:147], v[200:203]\n"
"  buffer_load_dwordx4  acc[208:211], %[v_os_b1], s[12:15], 0 offen\n"
"  v_mfma_i32_16x16x32_i8  v[200:203], acc[84:85], v[148:149], v[200:203]\n"
"  v_mfma_i32_16x16x32_i8  v[200:203], acc[86:87], v[150:151], v[200:203]\n"
"  v_mfma_i32_16x16x32_i8  v[200:203], acc[88:89], v[152:153], v[200:203]\n"
"  v_mfma_i32_16x16x32_i8  v[200:203], acc[90:91], v[154:155], v[200:203]\n"
"  buffer_load_dwordx4  acc[212:215], %[v_os_b1], s[12:15], 0 offen offset:1024\n"
"  v_mfma_i32_16x16x32_i8  v[200:203], acc[92:93], v[156:157], v[200:203]\n"
"  v_mfma_i32_16x16x32_i8  v[200:203], acc[94:95], v[158:159], v[200:203]\n"
// v[204:207] += acc[80:95] x v[176:191]
"  v_mfma_i32_16x16x32_i8  v[204:207], acc[80:81], v[176:177], v[204:207]\n"
"  v_mfma_i32_16x16x32_i8  v[204:207], acc[82:83], v[178:179], v[204:207]\n"
"  buffer_load_dwordx4  acc[216:219], %[v_os_b1], s[12:15], 0 offen offset:2048\n"
"  v_mfma_i32_16x16x32_i8  v[204:207], acc[84:85], v[180:181], v[204:207]\n"
"  v_mfma_i32_16x16x32_i8  v[204:207], acc[86:87], v[182:183], v[204:207]\n"
"  v_mfma_i32_16x16x32_i8  v[204:207], acc[88:89], v[184:185], v[204:207]\n"
"  v_mfma_i32_16x16x32_i8  v[204:207], acc[90:91], v[186:187], v[204:207]\n"
"  buffer_load_dwordx4  acc[220:223], %[v_os_b1], s[12:15], 0 offen offset:3072\n"
"  v_mfma_i32_16x16x32_i8  v[204:207], acc[92:93], v[188:189], v[204:207]\n"
"  v_mfma_i32_16x16x32_i8  v[204:207], acc[94:95], v[190:191], v[204:207]\n"
// Wait threshold is 40 here, one lower than the vmcnt(41) used by the other
// sections in this chunk.
// NOTE(review): this looks intentional - this section issues one extra
// vector-memory operation (the buffer_load_dword below) besides its eight
// dwordx4 loads - but the matching producer loads are outside this chunk; confirm.
"  s_waitcnt     vmcnt(40)                              \n"
// v[208:211] += acc[96:111] x v[144:159]; interleaved loads refill acc[224:255].
"  v_mfma_i32_16x16x32_i8  v[208:211], acc[96:97], v[144:145], v[208:211]\n"
"  v_mfma_i32_16x16x32_i8  v[208:211], acc[98:99], v[146:147], v[208:211]\n"
"  buffer_load_dwordx4  acc[224:227], %[v_os_b2], s[12:15], 0 offen\n"
"  v_mfma_i32_16x16x32_i8  v[208:211], acc[100:101], v[148:149], v[208:211]\n"
"  v_mfma_i32_16x16x32_i8  v[208:211], acc[102:103], v[150:151], v[208:211]\n"
// One dword per lane into v13 through a different descriptor (s[16:19], offset v5).
// The scaling code later in this iteration reads v12, not v13; presumably v12/v13
// alternate between loop halves - verify against the other half of the loop.
"  buffer_load_dword  v13, v5, s[16:19], 0 offen        \n"
"  v_mfma_i32_16x16x32_i8  v[208:211], acc[104:105], v[152:153], v[208:211]\n"
"  v_mfma_i32_16x16x32_i8  v[208:211], acc[106:107], v[154:155], v[208:211]\n"
"  buffer_load_dwordx4  acc[228:231], %[v_os_b2], s[12:15], 0 offen offset:1024\n"
"  v_mfma_i32_16x16x32_i8  v[208:211], acc[108:109], v[156:157], v[208:211]\n"
"  v_mfma_i32_16x16x32_i8  v[208:211], acc[110:111], v[158:159], v[208:211]\n"
// v[212:215] += acc[96:111] x v[176:191]
"  v_mfma_i32_16x16x32_i8  v[212:215], acc[96:97], v[176:177], v[212:215]\n"
"  v_mfma_i32_16x16x32_i8  v[212:215], acc[98:99], v[178:179], v[212:215]\n"
"  buffer_load_dwordx4  acc[232:235], %[v_os_b2], s[12:15], 0 offen offset:2048\n"
"  v_mfma_i32_16x16x32_i8  v[212:215], acc[100:101], v[180:181], v[212:215]\n"
"  v_mfma_i32_16x16x32_i8  v[212:215], acc[102:103], v[182:183], v[212:215]\n"
"  v_mfma_i32_16x16x32_i8  v[212:215], acc[104:105], v[184:185], v[212:215]\n"
"  v_mfma_i32_16x16x32_i8  v[212:215], acc[106:107], v[186:187], v[212:215]\n"
"  buffer_load_dwordx4  acc[236:239], %[v_os_b2], s[12:15], 0 offen offset:3072\n"
"  v_mfma_i32_16x16x32_i8  v[212:215], acc[108:109], v[188:189], v[212:215]\n"
"  v_mfma_i32_16x16x32_i8  v[212:215], acc[110:111], v[190:191], v[212:215]\n"
// v[216:219] += acc[112:127] x v[144:159]
"  v_mfma_i32_16x16x32_i8  v[216:219], acc[112:113], v[144:145], v[216:219]\n"
"  v_mfma_i32_16x16x32_i8  v[216:219], acc[114:115], v[146:147], v[216:219]\n"
"  buffer_load_dwordx4  acc[240:243], %[v_os_b3], s[12:15], 0 offen\n"
"  v_mfma_i32_16x16x32_i8  v[216:219], acc[116:117], v[148:149], v[216:219]\n"
"  v_mfma_i32_16x16x32_i8  v[216:219], acc[118:119], v[150:151], v[216:219]\n"
"  v_mfma_i32_16x16x32_i8  v[216:219], acc[120:121], v[152:153], v[216:219]\n"
"  v_mfma_i32_16x16x32_i8  v[216:219], acc[122:123], v[154:155], v[216:219]\n"
"  buffer_load_dwordx4  acc[244:247], %[v_os_b3], s[12:15], 0 offen offset:1024\n"
"  v_mfma_i32_16x16x32_i8  v[216:219], acc[124:125], v[156:157], v[216:219]\n"
"  v_mfma_i32_16x16x32_i8  v[216:219], acc[126:127], v[158:159], v[216:219]\n"
// v[220:223] += acc[112:127] x v[176:191]
"  v_mfma_i32_16x16x32_i8  v[220:223], acc[112:113], v[176:177], v[220:223]\n"
"  v_mfma_i32_16x16x32_i8  v[220:223], acc[114:115], v[178:179], v[220:223]\n"
"  buffer_load_dwordx4  acc[248:251], %[v_os_b3], s[12:15], 0 offen offset:2048\n"
"  v_mfma_i32_16x16x32_i8  v[220:223], acc[116:117], v[180:181], v[220:223]\n"
"  v_mfma_i32_16x16x32_i8  v[220:223], acc[118:119], v[182:183], v[220:223]\n"
"  v_mfma_i32_16x16x32_i8  v[220:223], acc[120:121], v[184:185], v[220:223]\n"
"  v_mfma_i32_16x16x32_i8  v[220:223], acc[122:123], v[186:187], v[220:223]\n"
"  buffer_load_dwordx4  acc[252:255], %[v_os_b3], s[12:15], 0 offen offset:3072\n"
"  v_mfma_i32_16x16x32_i8  v[220:223], acc[124:125], v[188:189], v[220:223]\n"
"  v_mfma_i32_16x16x32_i8  v[220:223], acc[126:127], v[190:191], v[220:223]\n"
// Look-ahead guard: SCC = (s80 + 0x200 < s81), unsigned compare.
// Each s_cselect keeps its operand when SCC is set and replaces it with 0
// otherwise. The operand registers themselves are overwritten, so once a
// stride has been zeroed it stays zero on later passes.
// NOTE(review): s80/s81 look like the loop counter and its bound (see the
// s_addk_i32/s_cmp_lt_i32 on s80 near the loop branch, which uses a signed compare) - confirm.
"  s_add_u32     s60, 0x00000200, s80                   \n"
"  s_cmp_lt_u32  s60, s81                               \n"
"  s_cselect_b32  %[s_tile_os_b], %[s_tile_os_b], 0                           \n"
"  s_cselect_b32  %[s_tile_os_b_half], %[s_tile_os_b_half], 0                           \n"
"  s_cselect_b32  %[s_tile_os_dq], %[s_tile_os_dq], 0                           \n"
// 64-bit adds of the (possibly zeroed) strides into the low two dwords of the
// two load descriptors: s[12:15] (dwordx4 loads) and s[16:19] (dword load into v13).
"  s_add_u32     s12, %[s_tile_os_b], s12                          \n"
"  s_addc_u32    s13, 0, s13                            \n"
"  s_add_u32     s16, %[s_tile_os_dq], s16                          \n"
"  s_addc_u32    s17, 0, s17                            \n"
// Convert the int32 MFMA results in v[192:223] to f32 in place and apply three
// per-element multiplies to each group of four registers:
//   1. by v24 (groups 192, 200, 208, 216) or v25 (groups 196, 204, 212, 220);
//   2. by v12 read through DPP row_newbcast:N on the first source - N runs 0..3
//      for groups 192/196, 4..7 for 200/204, 8..11 for 208/212, 12..15 for 216/220;
//   3. by v20 (paired with v24) or v21 (paired with v25).
// NOTE(review): v24/v25, v12 and v20/v21 look like dequantization / weighting
// scales, but their definitions are outside this chunk - confirm their meaning there.
"  v_cvt_f32_i32  v192, v192                            \n"
"  v_cvt_f32_i32  v193, v193                            \n"
"  v_cvt_f32_i32  v194, v194                            \n"
"  v_cvt_f32_i32  v195, v195                            \n"
"  v_mul_f32     v192, v24, v192                        \n"
"  v_mul_f32     v193, v24, v193                        \n"
"  v_mul_f32     v194, v24, v194                        \n"
"  v_mul_f32     v195, v24, v195                        \n"
"  v_mul_f32     v192, v12, v192 row_newbcast:0         \n"
"  v_mul_f32     v193, v12, v193 row_newbcast:1         \n"
"  v_mul_f32     v194, v12, v194 row_newbcast:2         \n"
"  v_mul_f32     v195, v12, v195 row_newbcast:3         \n"
"  v_mul_f32     v192, v20, v192                        \n"
"  v_mul_f32     v193, v20, v193                        \n"
"  v_mul_f32     v194, v20, v194                        \n"
"  v_mul_f32     v195, v20, v195                        \n"
// Group v[196:199]: v25, v12 lanes 0..3, v21.
"  v_cvt_f32_i32  v196, v196                            \n"
"  v_cvt_f32_i32  v197, v197                            \n"
"  v_cvt_f32_i32  v198, v198                            \n"
"  v_cvt_f32_i32  v199, v199                            \n"
"  v_mul_f32     v196, v25, v196                        \n"
"  v_mul_f32     v197, v25, v197                        \n"
"  v_mul_f32     v198, v25, v198                        \n"
"  v_mul_f32     v199, v25, v199                        \n"
"  v_mul_f32     v196, v12, v196 row_newbcast:0         \n"
"  v_mul_f32     v197, v12, v197 row_newbcast:1         \n"
"  v_mul_f32     v198, v12, v198 row_newbcast:2         \n"
"  v_mul_f32     v199, v12, v199 row_newbcast:3         \n"
"  v_mul_f32     v196, v21, v196                        \n"
"  v_mul_f32     v197, v21, v197                        \n"
"  v_mul_f32     v198, v21, v198                        \n"
"  v_mul_f32     v199, v21, v199                        \n"
// Group v[200:203]: v24, v12 lanes 4..7, v20.
"  v_cvt_f32_i32  v200, v200                            \n"
"  v_cvt_f32_i32  v201, v201                            \n"
"  v_cvt_f32_i32  v202, v202                            \n"
"  v_cvt_f32_i32  v203, v203                            \n"
"  v_mul_f32     v200, v24, v200                        \n"
"  v_mul_f32     v201, v24, v201                        \n"
"  v_mul_f32     v202, v24, v202                        \n"
"  v_mul_f32     v203, v24, v203                        \n"
"  v_mul_f32     v200, v12, v200 row_newbcast:4         \n"
"  v_mul_f32     v201, v12, v201 row_newbcast:5         \n"
"  v_mul_f32     v202, v12, v202 row_newbcast:6         \n"
"  v_mul_f32     v203, v12, v203 row_newbcast:7         \n"
"  v_mul_f32     v200, v20, v200                        \n"
"  v_mul_f32     v201, v20, v201                        \n"
"  v_mul_f32     v202, v20, v202                        \n"
"  v_mul_f32     v203, v20, v203                        \n"
// Group v[204:207]: v25, v12 lanes 4..7, v21.
"  v_cvt_f32_i32  v204, v204                            \n"
"  v_cvt_f32_i32  v205, v205                            \n"
"  v_cvt_f32_i32  v206, v206                            \n"
"  v_cvt_f32_i32  v207, v207                            \n"
"  v_mul_f32     v204, v25, v204                        \n"
"  v_mul_f32     v205, v25, v205                        \n"
"  v_mul_f32     v206, v25, v206                        \n"
"  v_mul_f32     v207, v25, v207                        \n"
"  v_mul_f32     v204, v12, v204 row_newbcast:4         \n"
"  v_mul_f32     v205, v12, v205 row_newbcast:5         \n"
"  v_mul_f32     v206, v12, v206 row_newbcast:6         \n"
"  v_mul_f32     v207, v12, v207 row_newbcast:7         \n"
"  v_mul_f32     v204, v21, v204                        \n"
"  v_mul_f32     v205, v21, v205                        \n"
"  v_mul_f32     v206, v21, v206                        \n"
"  v_mul_f32     v207, v21, v207                        \n"
// Group v[208:211]: v24, v12 lanes 8..11, v20.
"  v_cvt_f32_i32  v208, v208                            \n"
"  v_cvt_f32_i32  v209, v209                            \n"
"  v_cvt_f32_i32  v210, v210                            \n"
"  v_cvt_f32_i32  v211, v211                            \n"
"  v_mul_f32     v208, v24, v208                        \n"
"  v_mul_f32     v209, v24, v209                        \n"
"  v_mul_f32     v210, v24, v210                        \n"
"  v_mul_f32     v211, v24, v211                        \n"
"  v_mul_f32     v208, v12, v208 row_newbcast:8         \n"
"  v_mul_f32     v209, v12, v209 row_newbcast:9         \n"
"  v_mul_f32     v210, v12, v210 row_newbcast:10        \n"
"  v_mul_f32     v211, v12, v211 row_newbcast:11        \n"
"  v_mul_f32     v208, v20, v208                        \n"
"  v_mul_f32     v209, v20, v209                        \n"
"  v_mul_f32     v210, v20, v210                        \n"
"  v_mul_f32     v211, v20, v211                        \n"
// Group v[212:215]: v25, v12 lanes 8..11, v21.
"  v_cvt_f32_i32  v212, v212                            \n"
"  v_cvt_f32_i32  v213, v213                            \n"
"  v_cvt_f32_i32  v214, v214                            \n"
"  v_cvt_f32_i32  v215, v215                            \n"
"  v_mul_f32     v212, v25, v212                        \n"
"  v_mul_f32     v213, v25, v213                        \n"
"  v_mul_f32     v214, v25, v214                        \n"
"  v_mul_f32     v215, v25, v215                        \n"
"  v_mul_f32     v212, v12, v212 row_newbcast:8         \n"
"  v_mul_f32     v213, v12, v213 row_newbcast:9         \n"
"  v_mul_f32     v214, v12, v214 row_newbcast:10        \n"
"  v_mul_f32     v215, v12, v215 row_newbcast:11        \n"
"  v_mul_f32     v212, v21, v212                        \n"
"  v_mul_f32     v213, v21, v213                        \n"
"  v_mul_f32     v214, v21, v214                        \n"
"  v_mul_f32     v215, v21, v215                        \n"
// Group v[216:219]: v24, v12 lanes 12..15, v20.
"  v_cvt_f32_i32  v216, v216                            \n"
"  v_cvt_f32_i32  v217, v217                            \n"
"  v_cvt_f32_i32  v218, v218                            \n"
"  v_cvt_f32_i32  v219, v219                            \n"
"  v_mul_f32     v216, v24, v216                        \n"
"  v_mul_f32     v217, v24, v217                        \n"
"  v_mul_f32     v218, v24, v218                        \n"
"  v_mul_f32     v219, v24, v219                        \n"
"  v_mul_f32     v216, v12, v216 row_newbcast:12        \n"
"  v_mul_f32     v217, v12, v217 row_newbcast:13        \n"
"  v_mul_f32     v218, v12, v218 row_newbcast:14        \n"
"  v_mul_f32     v219, v12, v219 row_newbcast:15        \n"
"  v_mul_f32     v216, v20, v216                        \n"
"  v_mul_f32     v217, v20, v217                        \n"
"  v_mul_f32     v218, v20, v218                        \n"
"  v_mul_f32     v219, v20, v219                        \n"
// Group v[220:223]: v25, v12 lanes 12..15, v21.
"  v_cvt_f32_i32  v220, v220                            \n"
"  v_cvt_f32_i32  v221, v221                            \n"
"  v_cvt_f32_i32  v222, v222                            \n"
"  v_cvt_f32_i32  v223, v223                            \n"
"  v_mul_f32     v220, v25, v220                        \n"
"  v_mul_f32     v221, v25, v221                        \n"
"  v_mul_f32     v222, v25, v222                        \n"
"  v_mul_f32     v223, v25, v223                        \n"
"  v_mul_f32     v220, v12, v220 row_newbcast:12        \n"
"  v_mul_f32     v221, v12, v221 row_newbcast:13        \n"
"  v_mul_f32     v222, v12, v222 row_newbcast:14        \n"
"  v_mul_f32     v223, v12, v223 row_newbcast:15        \n"
"  v_mul_f32     v220, v21, v220                        \n"
"  v_mul_f32     v221, v21, v221                        \n"
"  v_mul_f32     v222, v21, v222                        \n"
"  v_mul_f32     v223, v21, v223                        \n"
// Pack the 32 f32 values in v[192:223] pairwise into 16 registers v[192:207].
// Per pair (x0, x1):
//   v_cmp_u_f32 x, x   -> lane mask in s[48:49], set where x is NaN (unordered with itself)
//   v_add3_u32         -> v50 = x + v53 + 1 (integer add on the raw bits)
//   v_cndmask_b32      -> v54 / v55 = NaN lane ? v52 : v50
//   v_perm_b32         -> byte-select from v55:v54 under selector s52 into the destination
// This is the same instruction sequence as the _UK_PK_CVT_ macro defined at the
// top of the file for the INT8 configuration, with v53 / v52 hard-coded where the
// macro uses %[v_nan_lo] / %[v_nan_hi], and with s[48:49] as compare scratch
// where the macro uses s[36:37].
// NOTE(review): given the bf16 atomics that consume these values, s52 presumably
// selects the upper halves (f32 -> bf16 with rounding) - s52, v52 and v53 are set
// outside this chunk, confirm there.
// Destinations are written in place: each v_perm target index is never higher
// than a source that is still unread, so no live input is overwritten.
"  v_cmp_u_f32   s[48:49], v192, v192                   \n"
"  v_add3_u32    v50, v192, v53, 1                      \n"
"  v_cndmask_b32  v54, v50, v52, s[48:49]               \n"
"  v_cmp_u_f32   s[48:49], v193, v193                   \n"
"  v_add3_u32    v50, v193, v53, 1                      \n"
"  v_cndmask_b32  v55, v50, v52, s[48:49]               \n"
"  v_perm_b32    v192, v55, v54, s52                    \n"
"  v_cmp_u_f32   s[48:49], v194, v194                   \n"
"  v_add3_u32    v50, v194, v53, 1                      \n"
"  v_cndmask_b32  v54, v50, v52, s[48:49]               \n"
"  v_cmp_u_f32   s[48:49], v195, v195                   \n"
"  v_add3_u32    v50, v195, v53, 1                      \n"
"  v_cndmask_b32  v55, v50, v52, s[48:49]               \n"
"  v_perm_b32    v193, v55, v54, s52                    \n"
"  v_cmp_u_f32   s[48:49], v196, v196                   \n"
"  v_add3_u32    v50, v196, v53, 1                      \n"
"  v_cndmask_b32  v54, v50, v52, s[48:49]               \n"
"  v_cmp_u_f32   s[48:49], v197, v197                   \n"
"  v_add3_u32    v50, v197, v53, 1                      \n"
"  v_cndmask_b32  v55, v50, v52, s[48:49]               \n"
"  v_perm_b32    v194, v55, v54, s52                    \n"
"  v_cmp_u_f32   s[48:49], v198, v198                   \n"
"  v_add3_u32    v50, v198, v53, 1                      \n"
"  v_cndmask_b32  v54, v50, v52, s[48:49]               \n"
"  v_cmp_u_f32   s[48:49], v199, v199                   \n"
"  v_add3_u32    v50, v199, v53, 1                      \n"
"  v_cndmask_b32  v55, v50, v52, s[48:49]               \n"
"  v_perm_b32    v195, v55, v54, s52                    \n"
"  v_cmp_u_f32   s[48:49], v200, v200                   \n"
"  v_add3_u32    v50, v200, v53, 1                      \n"
"  v_cndmask_b32  v54, v50, v52, s[48:49]               \n"
"  v_cmp_u_f32   s[48:49], v201, v201                   \n"
"  v_add3_u32    v50, v201, v53, 1                      \n"
"  v_cndmask_b32  v55, v50, v52, s[48:49]               \n"
"  v_perm_b32    v196, v55, v54, s52                    \n"
"  v_cmp_u_f32   s[48:49], v202, v202                   \n"
"  v_add3_u32    v50, v202, v53, 1                      \n"
"  v_cndmask_b32  v54, v50, v52, s[48:49]               \n"
"  v_cmp_u_f32   s[48:49], v203, v203                   \n"
"  v_add3_u32    v50, v203, v53, 1                      \n"
"  v_cndmask_b32  v55, v50, v52, s[48:49]               \n"
"  v_perm_b32    v197, v55, v54, s52                    \n"
"  v_cmp_u_f32   s[48:49], v204, v204                   \n"
"  v_add3_u32    v50, v204, v53, 1                      \n"
"  v_cndmask_b32  v54, v50, v52, s[48:49]               \n"
"  v_cmp_u_f32   s[48:49], v205, v205                   \n"
"  v_add3_u32    v50, v205, v53, 1                      \n"
"  v_cndmask_b32  v55, v50, v52, s[48:49]               \n"
"  v_perm_b32    v198, v55, v54, s52                    \n"
"  v_cmp_u_f32   s[48:49], v206, v206                   \n"
"  v_add3_u32    v50, v206, v53, 1                      \n"
"  v_cndmask_b32  v54, v50, v52, s[48:49]               \n"
"  v_cmp_u_f32   s[48:49], v207, v207                   \n"
"  v_add3_u32    v50, v207, v53, 1                      \n"
"  v_cndmask_b32  v55, v50, v52, s[48:49]               \n"
"  v_perm_b32    v199, v55, v54, s52                    \n"
"  v_cmp_u_f32   s[48:49], v208, v208                   \n"
"  v_add3_u32    v50, v208, v53, 1                      \n"
"  v_cndmask_b32  v54, v50, v52, s[48:49]               \n"
"  v_cmp_u_f32   s[48:49], v209, v209                   \n"
"  v_add3_u32    v50, v209, v53, 1                      \n"
"  v_cndmask_b32  v55, v50, v52, s[48:49]               \n"
"  v_perm_b32    v200, v55, v54, s52                    \n"
"  v_cmp_u_f32   s[48:49], v210, v210                   \n"
"  v_add3_u32    v50, v210, v53, 1                      \n"
"  v_cndmask_b32  v54, v50, v52, s[48:49]               \n"
"  v_cmp_u_f32   s[48:49], v211, v211                   \n"
"  v_add3_u32    v50, v211, v53, 1                      \n"
"  v_cndmask_b32  v55, v50, v52, s[48:49]               \n"
"  v_perm_b32    v201, v55, v54, s52                    \n"
"  v_cmp_u_f32   s[48:49], v212, v212                   \n"
"  v_add3_u32    v50, v212, v53, 1                      \n"
"  v_cndmask_b32  v54, v50, v52, s[48:49]               \n"
"  v_cmp_u_f32   s[48:49], v213, v213                   \n"
"  v_add3_u32    v50, v213, v53, 1                      \n"
"  v_cndmask_b32  v55, v50, v52, s[48:49]               \n"
"  v_perm_b32    v202, v55, v54, s52                    \n"
"  v_cmp_u_f32   s[48:49], v214, v214                   \n"
"  v_add3_u32    v50, v214, v53, 1                      \n"
"  v_cndmask_b32  v54, v50, v52, s[48:49]               \n"
"  v_cmp_u_f32   s[48:49], v215, v215                   \n"
"  v_add3_u32    v50, v215, v53, 1                      \n"
"  v_cndmask_b32  v55, v50, v52, s[48:49]               \n"
"  v_perm_b32    v203, v55, v54, s52                    \n"
"  v_cmp_u_f32   s[48:49], v216, v216                   \n"
"  v_add3_u32    v50, v216, v53, 1                      \n"
"  v_cndmask_b32  v54, v50, v52, s[48:49]               \n"
"  v_cmp_u_f32   s[48:49], v217, v217                   \n"
"  v_add3_u32    v50, v217, v53, 1                      \n"
"  v_cndmask_b32  v55, v50, v52, s[48:49]               \n"
"  v_perm_b32    v204, v55, v54, s52                    \n"
"  v_cmp_u_f32   s[48:49], v218, v218                   \n"
"  v_add3_u32    v50, v218, v53, 1                      \n"
"  v_cndmask_b32  v54, v50, v52, s[48:49]               \n"
"  v_cmp_u_f32   s[48:49], v219, v219                   \n"
"  v_add3_u32    v50, v219, v53, 1                      \n"
"  v_cndmask_b32  v55, v50, v52, s[48:49]               \n"
"  v_perm_b32    v205, v55, v54, s52                    \n"
"  v_cmp_u_f32   s[48:49], v220, v220                   \n"
"  v_add3_u32    v50, v220, v53, 1                      \n"
"  v_cndmask_b32  v54, v50, v52, s[48:49]               \n"
"  v_cmp_u_f32   s[48:49], v221, v221                   \n"
"  v_add3_u32    v50, v221, v53, 1                      \n"
"  v_cndmask_b32  v55, v50, v52, s[48:49]               \n"
"  v_perm_b32    v206, v55, v54, s52                    \n"
"  v_cmp_u_f32   s[48:49], v222, v222                   \n"
"  v_add3_u32    v50, v222, v53, 1                      \n"
"  v_cndmask_b32  v54, v50, v52, s[48:49]               \n"
"  v_cmp_u_f32   s[48:49], v223, v223                   \n"
"  v_add3_u32    v50, v223, v53, 1                      \n"
"  v_cndmask_b32  v55, v50, v52, s[48:49]               \n"
"  v_perm_b32    v207, v55, v54, s52                    \n"
// Round-trip the packed results through LDS to change the per-lane layout.
// Writes: eight 64-bit stores at address v3 plus an immediate offset. The
// offsets form two groups 8704 bytes apart (35072.. and 43776..), each stepping
// by 2176 bytes; consecutive register pairs alternate between the two groups.
"  ds_write_b64  v3, v[192:193] offset:35072            \n"
"  ds_write_b64  v3, v[194:195] offset:43776            \n"
"  ds_write_b64  v3, v[196:197] offset:37248            \n"
"  ds_write_b64  v3, v[198:199] offset:45952            \n"
"  ds_write_b64  v3, v[200:201] offset:39424            \n"
"  ds_write_b64  v3, v[202:203] offset:48128            \n"
"  ds_write_b64  v3, v[204:205] offset:41600            \n"
"  ds_write_b64  v3, v[206:207] offset:50304            \n"
// Drain this wave's LDS writes, then barrier so that every wave's writes are
// in place before any wave starts reading.
"  s_waitcnt     lgkmcnt(0)                             \n"
"  s_barrier                                            \n"
// Reads: sixteen 32-bit loads at address v4 plus an immediate offset into
// v[64:79]. Even/odd destination registers come from bases 4352 bytes apart
// (35072 / 39424, then 43776 / 48128); each base is stepped by 32 bytes per pair.
"  ds_read_b32   v64, v4 offset:35072                   \n"
"  ds_read_b32   v65, v4 offset:39424                   \n"
"  ds_read_b32   v66, v4 offset:35104                   \n"
"  ds_read_b32   v67, v4 offset:39456                   \n"
"  ds_read_b32   v68, v4 offset:35136                   \n"
"  ds_read_b32   v69, v4 offset:39488                   \n"
"  ds_read_b32   v70, v4 offset:35168                   \n"
"  ds_read_b32   v71, v4 offset:39520                   \n"
"  ds_read_b32   v72, v4 offset:43776                   \n"
"  ds_read_b32   v73, v4 offset:48128                   \n"
"  ds_read_b32   v74, v4 offset:43808                   \n"
"  ds_read_b32   v75, v4 offset:48160                   \n"
"  ds_read_b32   v76, v4 offset:43840                   \n"
"  ds_read_b32   v77, v4 offset:48192                   \n"
"  ds_read_b32   v78, v4 offset:43872                   \n"
"  ds_read_b32   v79, v4 offset:48224                   \n"
// All sixteen reads must have landed before v[64:79] are used as atomic data.
"  s_waitcnt     lgkmcnt(0)                             \n"
// Masked scatter of v[64:79] with packed-bf16 atomic adds.
// Addressing: scalar base s[8:9] + per-lane offset register (v80, v82, ... v94),
// with the second atomic of each pair 256 bytes further on (inst_offset:256).
// Lane masks: each pair of atomics runs under one mask taken from s[20:21],
// s[22:23], ... s[34:35]. exec is restored from s[36:37] once, after the last
// atomic.
//
// The previous form restored exec from s[36:37] after every atomic and then
// re-loaded the mask before the next one. Nothing between those writes reads
// exec, so every intermediate restore and every repeated mask load was dead;
// only the last write before each atomic and the final restore are observable.
// They are dropped here (32 exec writes -> 9) with identical architectural
// state on exit.
//
// NOTE(review): s[36:37] is treated here as the saved full exec mask. The
// _UK_PK_CVT_ macro at the top of the file uses s[36:37] as compare scratch, so
// that macro must not be expanded anywhere s[36:37] is expected to hold exec -
// verify against the code that initialises s[36:37].
"  s_mov_b64     exec, s[20:21]                         \n"
"  global_atomic_pk_add_bf16   v80, v64, s[8:9] \n"
"  global_atomic_pk_add_bf16   v80, v65, s[8:9] inst_offset:256\n"
"  s_mov_b64     exec, s[22:23]                         \n"
"  global_atomic_pk_add_bf16   v82, v66, s[8:9] \n"
"  global_atomic_pk_add_bf16   v82, v67, s[8:9] inst_offset:256\n"
"  s_mov_b64     exec, s[24:25]                         \n"
"  global_atomic_pk_add_bf16   v84, v68, s[8:9] \n"
"  global_atomic_pk_add_bf16   v84, v69, s[8:9] inst_offset:256\n"
"  s_mov_b64     exec, s[26:27]                         \n"
"  global_atomic_pk_add_bf16   v86, v70, s[8:9] \n"
"  global_atomic_pk_add_bf16   v86, v71, s[8:9] inst_offset:256\n"
"  s_mov_b64     exec, s[28:29]                         \n"
"  global_atomic_pk_add_bf16   v88, v72, s[8:9] \n"
"  global_atomic_pk_add_bf16   v88, v73, s[8:9] inst_offset:256\n"
"  s_mov_b64     exec, s[30:31]                         \n"
"  global_atomic_pk_add_bf16   v90, v74, s[8:9] \n"
"  global_atomic_pk_add_bf16   v90, v75, s[8:9] inst_offset:256\n"
"  s_mov_b64     exec, s[32:33]                         \n"
"  global_atomic_pk_add_bf16   v92, v76, s[8:9] \n"
"  global_atomic_pk_add_bf16   v92, v77, s[8:9] inst_offset:256\n"
"  s_mov_b64     exec, s[34:35]                         \n"
"  global_atomic_pk_add_bf16   v94, v78, s[8:9] \n"
"  global_atomic_pk_add_bf16   v94, v79, s[8:9] inst_offset:256\n"
// Single restore of the saved mask before any further vector work.
"  s_mov_b64     exec, s[36:37]                         \n"
// 64-bit advance of the atomic base pointer s[8:9] by s59.
"  s_add_u32     s8, s59, s8                            \n"
"  s_addc_u32    s9, 0, s9                              \n"
// Loop control: s80 += 0x100, then leave through label_2301 when s80 < s81 is
// false (s_cbranch_scc0 branches on SCC == 0).
// NOTE(review): this compare is signed (s_cmp_lt_i32) while the look-ahead test
// on the same registers earlier in the loop body is unsigned (s_cmp_lt_u32).
// The two agree as long as both values stay below 2^31 - confirm that bound.
"  s_addk_i32    s80, 0x0100                            \n"
"  s_cmp_lt_i32  s80, s81                               \n"
"  s_cbranch_scc0  label_2301                           \n"
// Continue path: wait until at most 41 vector-memory operations (loads and the
// atomics issued just before) are outstanding, then barrier across waves before
// the next compute section starts.
"  s_waitcnt     vmcnt(41)                              \n"
"  s_barrier                                            \n"
// Second output bank: results go to v[224:239] and the A operands come from
// acc[128:159]. All four chains are freshly seeded (last operand 0 on their first
// MFMA). Interleaved loads refill acc[0:31] through s[12:15] using %[v_os_b0]
// and %[v_os_b1] at offsets 0/1024/2048/3072.
// v[224:227] = acc[128:143] x v[128:143]
"  v_mfma_i32_16x16x32_i8  v[224:227], acc[128:129], v[128:129], 0\n"
"  v_mfma_i32_16x16x32_i8  v[224:227], acc[130:131], v[130:131], v[224:227]\n"
"  buffer_load_dwordx4  acc[0:3], %[v_os_b0], s[12:15], 0 offen\n"
"  v_mfma_i32_16x16x32_i8  v[224:227], acc[132:133], v[132:133], v[224:227]\n"
"  v_mfma_i32_16x16x32_i8  v[224:227], acc[134:135], v[134:135], v[224:227]\n"
"  v_mfma_i32_16x16x32_i8  v[224:227], acc[136:137], v[136:137], v[224:227]\n"
"  v_mfma_i32_16x16x32_i8  v[224:227], acc[138:139], v[138:139], v[224:227]\n"
"  buffer_load_dwordx4  acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024\n"
"  v_mfma_i32_16x16x32_i8  v[224:227], acc[140:141], v[140:141], v[224:227]\n"
"  v_mfma_i32_16x16x32_i8  v[224:227], acc[142:143], v[142:143], v[224:227]\n"
// v[228:231] = acc[128:143] x v[160:175]
"  v_mfma_i32_16x16x32_i8  v[228:231], acc[128:129], v[160:161], 0\n"
"  v_mfma_i32_16x16x32_i8  v[228:231], acc[130:131], v[162:163], v[228:231]\n"
"  buffer_load_dwordx4  acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048\n"
"  v_mfma_i32_16x16x32_i8  v[228:231], acc[132:133], v[164:165], v[228:231]\n"
"  v_mfma_i32_16x16x32_i8  v[228:231], acc[134:135], v[166:167], v[228:231]\n"
"  v_mfma_i32_16x16x32_i8  v[228:231], acc[136:137], v[168:169], v[228:231]\n"
"  v_mfma_i32_16x16x32_i8  v[228:231], acc[138:139], v[170:171], v[228:231]\n"
"  buffer_load_dwordx4  acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072\n"
"  v_mfma_i32_16x16x32_i8  v[228:231], acc[140:141], v[172:173], v[228:231]\n"
"  v_mfma_i32_16x16x32_i8  v[228:231], acc[142:143], v[174:175], v[228:231]\n"
// v[232:235] = acc[144:159] x v[128:143]
"  v_mfma_i32_16x16x32_i8  v[232:235], acc[144:145], v[128:129], 0\n"
"  v_mfma_i32_16x16x32_i8  v[232:235], acc[146:147], v[130:131], v[232:235]\n"
"  buffer_load_dwordx4  acc[16:19], %[v_os_b1], s[12:15], 0 offen\n"
"  v_mfma_i32_16x16x32_i8  v[232:235], acc[148:149], v[132:133], v[232:235]\n"
"  v_mfma_i32_16x16x32_i8  v[232:235], acc[150:151], v[134:135], v[232:235]\n"
"  v_mfma_i32_16x16x32_i8  v[232:235], acc[152:153], v[136:137], v[232:235]\n"
"  v_mfma_i32_16x16x32_i8  v[232:235], acc[154:155], v[138:139], v[232:235]\n"
"  buffer_load_dwordx4  acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024\n"
"  v_mfma_i32_16x16x32_i8  v[232:235], acc[156:157], v[140:141], v[232:235]\n"
"  v_mfma_i32_16x16x32_i8  v[232:235], acc[158:159], v[142:143], v[232:235]\n"
// v[236:239] = acc[144:159] x v[160:175]
"  v_mfma_i32_16x16x32_i8  v[236:239], acc[144:145], v[160:161], 0\n"
"  v_mfma_i32_16x16x32_i8  v[236:239], acc[146:147], v[162:163], v[236:239]\n"
"  buffer_load_dwordx4  acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048\n"
"  v_mfma_i32_16x16x32_i8  v[236:239], acc[148:149], v[164:165], v[236:239]\n"
"  v_mfma_i32_16x16x32_i8  v[236:239], acc[150:151], v[166:167], v[236:239]\n"
"  v_mfma_i32_16x16x32_i8  v[236:239], acc[152:153], v[168:169], v[236:239]\n"
"  v_mfma_i32_16x16x32_i8  v[236:239], acc[154:155], v[170:171], v[236:239]\n"
"  buffer_load_dwordx4  acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072\n"
"  v_mfma_i32_16x16x32_i8  v[236:239], acc[156:157], v[172:173], v[236:239]\n"
"  v_mfma_i32_16x16x32_i8  v[236:239], acc[158:159], v[174:175], v[236:239]\n"
// Wait until at most 41 vector-memory operations are outstanding; the MFMAs
// below read acc[160:191].
"  s_waitcnt     vmcnt(41)                              \n"
// Four freshly seeded chains into v[240:255]; interleaved loads refill
// acc[32:63] using %[v_os_b2] and %[v_os_b3].
// v[240:243] = acc[160:175] x v[128:143]
"  v_mfma_i32_16x16x32_i8  v[240:243], acc[160:161], v[128:129], 0\n"
"  v_mfma_i32_16x16x32_i8  v[240:243], acc[162:163], v[130:131], v[240:243]\n"
"  buffer_load_dwordx4  acc[32:35], %[v_os_b2], s[12:15], 0 offen\n"
"  v_mfma_i32_16x16x32_i8  v[240:243], acc[164:165], v[132:133], v[240:243]\n"
"  v_mfma_i32_16x16x32_i8  v[240:243], acc[166:167], v[134:135], v[240:243]\n"
"  v_mfma_i32_16x16x32_i8  v[240:243], acc[168:169], v[136:137], v[240:243]\n"
"  v_mfma_i32_16x16x32_i8  v[240:243], acc[170:171], v[138:139], v[240:243]\n"
"  buffer_load_dwordx4  acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024\n"
"  v_mfma_i32_16x16x32_i8  v[240:243], acc[172:173], v[140:141], v[240:243]\n"
"  v_mfma_i32_16x16x32_i8  v[240:243], acc[174:175], v[142:143], v[240:243]\n"
// v[244:247] = acc[160:175] x v[160:175]
"  v_mfma_i32_16x16x32_i8  v[244:247], acc[160:161], v[160:161], 0\n"
"  v_mfma_i32_16x16x32_i8  v[244:247], acc[162:163], v[162:163], v[244:247]\n"
"  buffer_load_dwordx4  acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048\n"
"  v_mfma_i32_16x16x32_i8  v[244:247], acc[164:165], v[164:165], v[244:247]\n"
"  v_mfma_i32_16x16x32_i8  v[244:247], acc[166:167], v[166:167], v[244:247]\n"
"  v_mfma_i32_16x16x32_i8  v[244:247], acc[168:169], v[168:169], v[244:247]\n"
"  v_mfma_i32_16x16x32_i8  v[244:247], acc[170:171], v[170:171], v[244:247]\n"
"  buffer_load_dwordx4  acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072\n"
"  v_mfma_i32_16x16x32_i8  v[244:247], acc[172:173], v[172:173], v[244:247]\n"
"  v_mfma_i32_16x16x32_i8  v[244:247], acc[174:175], v[174:175], v[244:247]\n"
// v[248:251] = acc[176:191] x v[128:143]
"  v_mfma_i32_16x16x32_i8  v[248:251], acc[176:177], v[128:129], 0\n"
"  v_mfma_i32_16x16x32_i8  v[248:251], acc[178:179], v[130:131], v[248:251]\n"
"  buffer_load_dwordx4  acc[48:51], %[v_os_b3], s[12:15], 0 offen\n"
"  v_mfma_i32_16x16x32_i8  v[248:251], acc[180:181], v[132:133], v[248:251]\n"
"  v_mfma_i32_16x16x32_i8  v[248:251], acc[182:183], v[134:135], v[248:251]\n"
"  v_mfma_i32_16x16x32_i8  v[248:251], acc[184:185], v[136:137], v[248:251]\n"
"  v_mfma_i32_16x16x32_i8  v[248:251], acc[186:187], v[138:139], v[248:251]\n"
"  buffer_load_dwordx4  acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024\n"
"  v_mfma_i32_16x16x32_i8  v[248:251], acc[188:189], v[140:141], v[248:251]\n"
"  v_mfma_i32_16x16x32_i8  v[248:251], acc[190:191], v[142:143], v[248:251]\n"
// v[252:255] = acc[176:191] x v[160:175]
"  v_mfma_i32_16x16x32_i8  v[252:255], acc[176:177], v[160:161], 0\n"
"  v_mfma_i32_16x16x32_i8  v[252:255], acc[178:179], v[162:163], v[252:255]\n"
"  buffer_load_dwordx4  acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048\n"
"  v_mfma_i32_16x16x32_i8  v[252:255], acc[180:181], v[164:165], v[252:255]\n"
"  v_mfma_i32_16x16x32_i8  v[252:255], acc[182:183], v[166:167], v[252:255]\n"
"  v_mfma_i32_16x16x32_i8  v[252:255], acc[184:185], v[168:169], v[252:255]\n"
"  v_mfma_i32_16x16x32_i8  v[252:255], acc[186:187], v[170:171], v[252:255]\n"
"  buffer_load_dwordx4  acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072\n"
// Advance the low two dwords of the load descriptor by %[s_tile_os_b_half]
// (64-bit add with carry) after the last load of this section.
"  s_add_u32     s12, %[s_tile_os_b_half], s12                          \n"
"  s_addc_u32    s13, 0, s13                            \n"
"  v_mfma_i32_16x16x32_i8  v[252:255], acc[188:189], v[172:173], v[252:255]\n"
"  v_mfma_i32_16x16x32_i8  v[252:255], acc[190:191], v[174:175], v[252:255]\n"
// Next section begins (it continues past this point): wait, then keep
// accumulating into v[224:227] with acc[192:...] against v[144:...].
"  s_waitcnt     vmcnt(41)                              \n"
"  v_mfma_i32_16x16x32_i8  v[224:227], acc[192:193], v[144:145], v[224:227]\n"
"  v_mfma_i32_16x16x32_i8  v[224:227], acc[194:195], v[146:147], v[224:227]\n"
"  buffer_load_dwordx4  acc[64:67], %[v_os_b0], s[12:15], 0 offen\n"
"  v_mfma_i32_16x16x32_i8  v[224:227], acc[196:197], v[148:149], v[224:227]\n"
"  v_mfma_i32_16x16x32_i8  v[224:227], acc[198:199], v[150:151], v[224:227]\n"
"  v_mfma_i32_16x16x32_i8  v[224:227], acc[200:201], v[152:153], v[224:227]\n"
"  v_mfma_i32_16x16x32_i8  v[224:227], acc[202:203], v[154:155], v[224:227]\n"
"  buffer_load_dwordx4  acc[68:71], %[v_os_b0], s[12:15], 0 offen offset:1024\n"
"  v_mfma_i32_16x16x32_i8  v[224:227], acc[204:205], v[156:157], v[224:227]\n"
"  v_mfma_i32_16x16x32_i8  v[224:227], acc[206:207], v[158:159], v[224:227]\n"
"  v_mfma_i32_16x16x32_i8  v[228:231], acc[192:193], v[176:177], v[228:231]\n"
"  v_mfma_i32_16x16x32_i8  v[228:231], acc[194:195], v[178:179], v[228:231]\n"
"  buffer_load_dwordx4  acc[72:75], %[v_os_b0], s[12:15], 0 offen offset:2048\n"
"  v_mfma_i32_16x16x32_i8  v[228:231], acc[196:197], v[180:181], v[228:231]\n"
"  v_mfma_i32_16x16x32_i8  v[228:231], acc[198:199], v[182:183], v[228:231]\n"
"  v_mfma_i32_16x16x32_i8  v[228:231], acc[200:201], v[184:185], v[228:231]\n"
"  v_mfma_i32_16x16x32_i8  v[228:231], acc[202:203], v[186:187], v[228:231]\n"
"  buffer_load_dwordx4  acc[76:79], %[v_os_b0], s[12:15], 0 offen offset:3072\n"
"  v_mfma_i32_16x16x32_i8  v[228:231], acc[204:205], v[188:189], v[228:231]\n"
"  v_mfma_i32_16x16x32_i8  v[228:231], acc[206:207], v[190:191], v[228:231]\n"
"  v_mfma_i32_16x16x32_i8  v[232:235], acc[208:209], v[144:145], v[232:235]\n"
"  v_mfma_i32_16x16x32_i8  v[232:235], acc[210:211], v[146:147], v[232:235]\n"
"  buffer_load_dwordx4  acc[80:83], %[v_os_b1], s[12:15], 0 offen\n"
"  v_mfma_i32_16x16x32_i8  v[232:235], acc[212:213], v[148:149], v[232:235]\n"
"  v_mfma_i32_16x16x32_i8  v[232:235], acc[214:215], v[150:151], v[232:235]\n"
"  v_mfma_i32_16x16x32_i8  v[232:235], acc[216:217], v[152:153], v[232:235]\n"
"  v_mfma_i32_16x16x32_i8  v[232:235], acc[218:219], v[154:155], v[232:235]\n"
"  buffer_load_dwordx4  acc[84:87], %[v_os_b1], s[12:15], 0 offen offset:1024\n"
"  v_mfma_i32_16x16x32_i8  v[232:235], acc[220:221], v[156:157], v[232:235]\n"
"  v_mfma_i32_16x16x32_i8  v[232:235], acc[222:223], v[158:159], v[232:235]\n"
"  v_mfma_i32_16x16x32_i8  v[236:239], acc[208:209], v[176:177], v[236:239]\n"
"  v_mfma_i32_16x16x32_i8  v[236:239], acc[210:211], v[178:179], v[236:239]\n"
"  buffer_load_dwordx4  acc[88:91], %[v_os_b1], s[12:15], 0 offen offset:2048\n"
"  v_mfma_i32_16x16x32_i8  v[236:239], acc[212:213], v[180:181], v[236:239]\n"
"  v_mfma_i32_16x16x32_i8  v[236:239], acc[214:215], v[182:183], v[236:239]\n"
"  v_mfma_i32_16x16x32_i8  v[236:239], acc[216:217], v[184:185], v[236:239]\n"
"  v_mfma_i32_16x16x32_i8  v[236:239], acc[218:219], v[186:187], v[236:239]\n"
"  buffer_load_dwordx4  acc[92:95], %[v_os_b1], s[12:15], 0 offen offset:3072\n"
"  v_mfma_i32_16x16x32_i8  v[236:239], acc[220:221], v[188:189], v[236:239]\n"
"  v_mfma_i32_16x16x32_i8  v[236:239], acc[222:223], v[190:191], v[236:239]\n"
"  s_waitcnt     vmcnt(40)                              \n"
"  v_mfma_i32_16x16x32_i8  v[240:243], acc[224:225], v[144:145], v[240:243]\n"
"  v_mfma_i32_16x16x32_i8  v[240:243], acc[226:227], v[146:147], v[240:243]\n"
"  buffer_load_dwordx4  acc[96:99], %[v_os_b2], s[12:15], 0 offen\n"
"  v_mfma_i32_16x16x32_i8  v[240:243], acc[228:229], v[148:149], v[240:243]\n"
"  v_mfma_i32_16x16x32_i8  v[240:243], acc[230:231], v[150:151], v[240:243]\n"
"  buffer_load_dword  v12, v5, s[16:19], 0 offen        \n"
"  v_mfma_i32_16x16x32_i8  v[240:243], acc[232:233], v[152:153], v[240:243]\n"
"  v_mfma_i32_16x16x32_i8  v[240:243], acc[234:235], v[154:155], v[240:243]\n"
"  buffer_load_dwordx4  acc[100:103], %[v_os_b2], s[12:15], 0 offen offset:1024\n"
"  v_mfma_i32_16x16x32_i8  v[240:243], acc[236:237], v[156:157], v[240:243]\n"
"  v_mfma_i32_16x16x32_i8  v[240:243], acc[238:239], v[158:159], v[240:243]\n"
"  v_mfma_i32_16x16x32_i8  v[244:247], acc[224:225], v[176:177], v[244:247]\n"
"  v_mfma_i32_16x16x32_i8  v[244:247], acc[226:227], v[178:179], v[244:247]\n"
"  buffer_load_dwordx4  acc[104:107], %[v_os_b2], s[12:15], 0 offen offset:2048\n"
"  v_mfma_i32_16x16x32_i8  v[244:247], acc[228:229], v[180:181], v[244:247]\n"
"  v_mfma_i32_16x16x32_i8  v[244:247], acc[230:231], v[182:183], v[244:247]\n"
"  v_mfma_i32_16x16x32_i8  v[244:247], acc[232:233], v[184:185], v[244:247]\n"
"  v_mfma_i32_16x16x32_i8  v[244:247], acc[234:235], v[186:187], v[244:247]\n"
"  buffer_load_dwordx4  acc[108:111], %[v_os_b2], s[12:15], 0 offen offset:3072\n"
"  v_mfma_i32_16x16x32_i8  v[244:247], acc[236:237], v[188:189], v[244:247]\n"
"  v_mfma_i32_16x16x32_i8  v[244:247], acc[238:239], v[190:191], v[244:247]\n"
"  v_mfma_i32_16x16x32_i8  v[248:251], acc[240:241], v[144:145], v[248:251]\n"
"  v_mfma_i32_16x16x32_i8  v[248:251], acc[242:243], v[146:147], v[248:251]\n"
"  buffer_load_dwordx4  acc[112:115], %[v_os_b3], s[12:15], 0 offen\n"
"  v_mfma_i32_16x16x32_i8  v[248:251], acc[244:245], v[148:149], v[248:251]\n"
"  v_mfma_i32_16x16x32_i8  v[248:251], acc[246:247], v[150:151], v[248:251]\n"
"  v_mfma_i32_16x16x32_i8  v[248:251], acc[248:249], v[152:153], v[248:251]\n"
"  v_mfma_i32_16x16x32_i8  v[248:251], acc[250:251], v[154:155], v[248:251]\n"
"  buffer_load_dwordx4  acc[116:119], %[v_os_b3], s[12:15], 0 offen offset:1024\n"
"  v_mfma_i32_16x16x32_i8  v[248:251], acc[252:253], v[156:157], v[248:251]\n"
"  v_mfma_i32_16x16x32_i8  v[248:251], acc[254:255], v[158:159], v[248:251]\n"
"  v_mfma_i32_16x16x32_i8  v[252:255], acc[240:241], v[176:177], v[252:255]\n"
"  v_mfma_i32_16x16x32_i8  v[252:255], acc[242:243], v[178:179], v[252:255]\n"
"  buffer_load_dwordx4  acc[120:123], %[v_os_b3], s[12:15], 0 offen offset:2048\n"
"  v_mfma_i32_16x16x32_i8  v[252:255], acc[244:245], v[180:181], v[252:255]\n"
"  v_mfma_i32_16x16x32_i8  v[252:255], acc[246:247], v[182:183], v[252:255]\n"
"  v_mfma_i32_16x16x32_i8  v[252:255], acc[248:249], v[184:185], v[252:255]\n"
"  v_mfma_i32_16x16x32_i8  v[252:255], acc[250:251], v[186:187], v[252:255]\n"
"  buffer_load_dwordx4  acc[124:127], %[v_os_b3], s[12:15], 0 offen offset:3072\n"
"  v_mfma_i32_16x16x32_i8  v[252:255], acc[252:253], v[188:189], v[252:255]\n"
"  v_mfma_i32_16x16x32_i8  v[252:255], acc[254:255], v[190:191], v[252:255]\n"
"  s_add_u32     s60, 0x00000200, s80                   \n"
"  s_cmp_lt_u32  s60, s81                               \n"
"  s_cselect_b32  %[s_tile_os_b], %[s_tile_os_b], 0                           \n"
"  s_cselect_b32  %[s_tile_os_b_half], %[s_tile_os_b_half], 0                           \n"
"  s_cselect_b32  %[s_tile_os_dq], %[s_tile_os_dq], 0                           \n"
"  s_add_u32     s12, %[s_tile_os_b], s12                          \n"
"  s_addc_u32    s13, 0, s13                            \n"
"  s_add_u32     s16, %[s_tile_os_dq], s16                          \n"
"  s_addc_u32    s17, 0, s17                            \n"
"  v_cvt_f32_i32  v224, v224                            \n"
"  v_cvt_f32_i32  v225, v225                            \n"
"  v_cvt_f32_i32  v226, v226                            \n"
"  v_cvt_f32_i32  v227, v227                            \n"
"  v_mul_f32     v224, v24, v224                        \n"
"  v_mul_f32     v225, v24, v225                        \n"
"  v_mul_f32     v226, v24, v226                        \n"
"  v_mul_f32     v227, v24, v227                        \n"
"  v_mul_f32     v224, v13, v224 row_newbcast:0         \n"
"  v_mul_f32     v225, v13, v225 row_newbcast:1         \n"
"  v_mul_f32     v226, v13, v226 row_newbcast:2         \n"
"  v_mul_f32     v227, v13, v227 row_newbcast:3         \n"
"  v_mul_f32     v224, v20, v224                        \n"
"  v_mul_f32     v225, v20, v225                        \n"
"  v_mul_f32     v226, v20, v226                        \n"
"  v_mul_f32     v227, v20, v227                        \n"
"  v_cvt_f32_i32  v228, v228                            \n"
"  v_cvt_f32_i32  v229, v229                            \n"
"  v_cvt_f32_i32  v230, v230                            \n"
"  v_cvt_f32_i32  v231, v231                            \n"
"  v_mul_f32     v228, v25, v228                        \n"
"  v_mul_f32     v229, v25, v229                        \n"
"  v_mul_f32     v230, v25, v230                        \n"
"  v_mul_f32     v231, v25, v231                        \n"
"  v_mul_f32     v228, v13, v228 row_newbcast:0         \n"
"  v_mul_f32     v229, v13, v229 row_newbcast:1         \n"
"  v_mul_f32     v230, v13, v230 row_newbcast:2         \n"
"  v_mul_f32     v231, v13, v231 row_newbcast:3         \n"
"  v_mul_f32     v228, v21, v228                        \n"
"  v_mul_f32     v229, v21, v229                        \n"
"  v_mul_f32     v230, v21, v230                        \n"
"  v_mul_f32     v231, v21, v231                        \n"
"  v_cvt_f32_i32  v232, v232                            \n"
"  v_cvt_f32_i32  v233, v233                            \n"
"  v_cvt_f32_i32  v234, v234                            \n"
"  v_cvt_f32_i32  v235, v235                            \n"
"  v_mul_f32     v232, v24, v232                        \n"
"  v_mul_f32     v233, v24, v233                        \n"
"  v_mul_f32     v234, v24, v234                        \n"
"  v_mul_f32     v235, v24, v235                        \n"
"  v_mul_f32     v232, v13, v232 row_newbcast:4         \n"
"  v_mul_f32     v233, v13, v233 row_newbcast:5         \n"
"  v_mul_f32     v234, v13, v234 row_newbcast:6         \n"
"  v_mul_f32     v235, v13, v235 row_newbcast:7         \n"
"  v_mul_f32     v232, v20, v232                        \n"
"  v_mul_f32     v233, v20, v233                        \n"
"  v_mul_f32     v234, v20, v234                        \n"
"  v_mul_f32     v235, v20, v235                        \n"
"  v_cvt_f32_i32  v236, v236                            \n"
"  v_cvt_f32_i32  v237, v237                            \n"
"  v_cvt_f32_i32  v238, v238                            \n"
"  v_cvt_f32_i32  v239, v239                            \n"
"  v_mul_f32     v236, v25, v236                        \n"
"  v_mul_f32     v237, v25, v237                        \n"
"  v_mul_f32     v238, v25, v238                        \n"
"  v_mul_f32     v239, v25, v239                        \n"
"  v_mul_f32     v236, v13, v236 row_newbcast:4         \n"
"  v_mul_f32     v237, v13, v237 row_newbcast:5         \n"
"  v_mul_f32     v238, v13, v238 row_newbcast:6         \n"
"  v_mul_f32     v239, v13, v239 row_newbcast:7         \n"
"  v_mul_f32     v236, v21, v236                        \n"
"  v_mul_f32     v237, v21, v237                        \n"
"  v_mul_f32     v238, v21, v238                        \n"
"  v_mul_f32     v239, v21, v239                        \n"
"  v_cvt_f32_i32  v240, v240                            \n"
"  v_cvt_f32_i32  v241, v241                            \n"
"  v_cvt_f32_i32  v242, v242                            \n"
"  v_cvt_f32_i32  v243, v243                            \n"
"  v_mul_f32     v240, v24, v240                        \n"
"  v_mul_f32     v241, v24, v241                        \n"
"  v_mul_f32     v242, v24, v242                        \n"
"  v_mul_f32     v243, v24, v243                        \n"
"  v_mul_f32     v240, v13, v240 row_newbcast:8         \n"
"  v_mul_f32     v241, v13, v241 row_newbcast:9         \n"
"  v_mul_f32     v242, v13, v242 row_newbcast:10        \n"
"  v_mul_f32     v243, v13, v243 row_newbcast:11        \n"
"  v_mul_f32     v240, v20, v240                        \n"
"  v_mul_f32     v241, v20, v241                        \n"
"  v_mul_f32     v242, v20, v242                        \n"
"  v_mul_f32     v243, v20, v243                        \n"
"  v_cvt_f32_i32  v244, v244                            \n"
"  v_cvt_f32_i32  v245, v245                            \n"
"  v_cvt_f32_i32  v246, v246                            \n"
"  v_cvt_f32_i32  v247, v247                            \n"
"  v_mul_f32     v244, v25, v244                        \n"
"  v_mul_f32     v245, v25, v245                        \n"
"  v_mul_f32     v246, v25, v246                        \n"
"  v_mul_f32     v247, v25, v247                        \n"
"  v_mul_f32     v244, v13, v244 row_newbcast:8         \n"
"  v_mul_f32     v245, v13, v245 row_newbcast:9         \n"
"  v_mul_f32     v246, v13, v246 row_newbcast:10        \n"
"  v_mul_f32     v247, v13, v247 row_newbcast:11        \n"
"  v_mul_f32     v244, v21, v244                        \n"
"  v_mul_f32     v245, v21, v245                        \n"
"  v_mul_f32     v246, v21, v246                        \n"
"  v_mul_f32     v247, v21, v247                        \n"
"  v_cvt_f32_i32  v248, v248                            \n"
"  v_cvt_f32_i32  v249, v249                            \n"
"  v_cvt_f32_i32  v250, v250                            \n"
"  v_cvt_f32_i32  v251, v251                            \n"
"  v_mul_f32     v248, v24, v248                        \n"
"  v_mul_f32     v249, v24, v249                        \n"
"  v_mul_f32     v250, v24, v250                        \n"
"  v_mul_f32     v251, v24, v251                        \n"
"  v_mul_f32     v248, v13, v248 row_newbcast:12        \n"
"  v_mul_f32     v249, v13, v249 row_newbcast:13        \n"
"  v_mul_f32     v250, v13, v250 row_newbcast:14        \n"
"  v_mul_f32     v251, v13, v251 row_newbcast:15        \n"
"  v_mul_f32     v248, v20, v248                        \n"
"  v_mul_f32     v249, v20, v249                        \n"
"  v_mul_f32     v250, v20, v250                        \n"
"  v_mul_f32     v251, v20, v251                        \n"
"  v_cvt_f32_i32  v252, v252                            \n"
"  v_cvt_f32_i32  v253, v253                            \n"
"  v_cvt_f32_i32  v254, v254                            \n"
"  v_cvt_f32_i32  v255, v255                            \n"
"  v_mul_f32     v252, v25, v252                        \n"
"  v_mul_f32     v253, v25, v253                        \n"
"  v_mul_f32     v254, v25, v254                        \n"
"  v_mul_f32     v255, v25, v255                        \n"
"  v_mul_f32     v252, v13, v252 row_newbcast:12        \n"
"  v_mul_f32     v253, v13, v253 row_newbcast:13        \n"
"  v_mul_f32     v254, v13, v254 row_newbcast:14        \n"
"  v_mul_f32     v255, v13, v255 row_newbcast:15        \n"
"  v_mul_f32     v252, v21, v252                        \n"
"  v_mul_f32     v253, v21, v253                        \n"
"  v_mul_f32     v254, v21, v254                        \n"
"  v_mul_f32     v255, v21, v255                        \n"
"  v_cmp_u_f32   s[48:49], v224, v224                   \n"
"  v_add3_u32    v50, v224, v53, 1                      \n"
"  v_cndmask_b32  v54, v50, v52, s[48:49]               \n"
"  v_cmp_u_f32   s[48:49], v225, v225                   \n"
"  v_add3_u32    v50, v225, v53, 1                      \n"
"  v_cndmask_b32  v55, v50, v52, s[48:49]               \n"
"  v_perm_b32    v224, v55, v54, s52                    \n"
"  v_cmp_u_f32   s[48:49], v226, v226                   \n"
"  v_add3_u32    v50, v226, v53, 1                      \n"
"  v_cndmask_b32  v54, v50, v52, s[48:49]               \n"
"  v_cmp_u_f32   s[48:49], v227, v227                   \n"
"  v_add3_u32    v50, v227, v53, 1                      \n"
"  v_cndmask_b32  v55, v50, v52, s[48:49]               \n"
"  v_perm_b32    v225, v55, v54, s52                    \n"
"  v_cmp_u_f32   s[48:49], v228, v228                   \n"
"  v_add3_u32    v50, v228, v53, 1                      \n"
"  v_cndmask_b32  v54, v50, v52, s[48:49]               \n"
"  v_cmp_u_f32   s[48:49], v229, v229                   \n"
"  v_add3_u32    v50, v229, v53, 1                      \n"
"  v_cndmask_b32  v55, v50, v52, s[48:49]               \n"
"  v_perm_b32    v226, v55, v54, s52                    \n"
"  v_cmp_u_f32   s[48:49], v230, v230                   \n"
"  v_add3_u32    v50, v230, v53, 1                      \n"
"  v_cndmask_b32  v54, v50, v52, s[48:49]               \n"
"  v_cmp_u_f32   s[48:49], v231, v231                   \n"
"  v_add3_u32    v50, v231, v53, 1                      \n"
"  v_cndmask_b32  v55, v50, v52, s[48:49]               \n"
"  v_perm_b32    v227, v55, v54, s52                    \n"
"  v_cmp_u_f32   s[48:49], v232, v232                   \n"
"  v_add3_u32    v50, v232, v53, 1                      \n"
"  v_cndmask_b32  v54, v50, v52, s[48:49]               \n"
"  v_cmp_u_f32   s[48:49], v233, v233                   \n"
"  v_add3_u32    v50, v233, v53, 1                      \n"
"  v_cndmask_b32  v55, v50, v52, s[48:49]               \n"
"  v_perm_b32    v228, v55, v54, s52                    \n"
"  v_cmp_u_f32   s[48:49], v234, v234                   \n"
"  v_add3_u32    v50, v234, v53, 1                      \n"
"  v_cndmask_b32  v54, v50, v52, s[48:49]               \n"
"  v_cmp_u_f32   s[48:49], v235, v235                   \n"
"  v_add3_u32    v50, v235, v53, 1                      \n"
"  v_cndmask_b32  v55, v50, v52, s[48:49]               \n"
"  v_perm_b32    v229, v55, v54, s52                    \n"
"  v_cmp_u_f32   s[48:49], v236, v236                   \n"
"  v_add3_u32    v50, v236, v53, 1                      \n"
"  v_cndmask_b32  v54, v50, v52, s[48:49]               \n"
"  v_cmp_u_f32   s[48:49], v237, v237                   \n"
"  v_add3_u32    v50, v237, v53, 1                      \n"
"  v_cndmask_b32  v55, v50, v52, s[48:49]               \n"
"  v_perm_b32    v230, v55, v54, s52                    \n"
"  v_cmp_u_f32   s[48:49], v238, v238                   \n"
"  v_add3_u32    v50, v238, v53, 1                      \n"
"  v_cndmask_b32  v54, v50, v52, s[48:49]               \n"
"  v_cmp_u_f32   s[48:49], v239, v239                   \n"
"  v_add3_u32    v50, v239, v53, 1                      \n"
"  v_cndmask_b32  v55, v50, v52, s[48:49]               \n"
"  v_perm_b32    v231, v55, v54, s52                    \n"
"  v_cmp_u_f32   s[48:49], v240, v240                   \n"
"  v_add3_u32    v50, v240, v53, 1                      \n"
"  v_cndmask_b32  v54, v50, v52, s[48:49]               \n"
"  v_cmp_u_f32   s[48:49], v241, v241                   \n"
"  v_add3_u32    v50, v241, v53, 1                      \n"
"  v_cndmask_b32  v55, v50, v52, s[48:49]               \n"
"  v_perm_b32    v232, v55, v54, s52                    \n"
"  v_cmp_u_f32   s[48:49], v242, v242                   \n"
"  v_add3_u32    v50, v242, v53, 1                      \n"
"  v_cndmask_b32  v54, v50, v52, s[48:49]               \n"
"  v_cmp_u_f32   s[48:49], v243, v243                   \n"
"  v_add3_u32    v50, v243, v53, 1                      \n"
"  v_cndmask_b32  v55, v50, v52, s[48:49]               \n"
"  v_perm_b32    v233, v55, v54, s52                    \n"
"  v_cmp_u_f32   s[48:49], v244, v244                   \n"
"  v_add3_u32    v50, v244, v53, 1                      \n"
"  v_cndmask_b32  v54, v50, v52, s[48:49]               \n"
"  v_cmp_u_f32   s[48:49], v245, v245                   \n"
"  v_add3_u32    v50, v245, v53, 1                      \n"
"  v_cndmask_b32  v55, v50, v52, s[48:49]               \n"
"  v_perm_b32    v234, v55, v54, s52                    \n"
"  v_cmp_u_f32   s[48:49], v246, v246                   \n"
"  v_add3_u32    v50, v246, v53, 1                      \n"
"  v_cndmask_b32  v54, v50, v52, s[48:49]               \n"
"  v_cmp_u_f32   s[48:49], v247, v247                   \n"
"  v_add3_u32    v50, v247, v53, 1                      \n"
"  v_cndmask_b32  v55, v50, v52, s[48:49]               \n"
"  v_perm_b32    v235, v55, v54, s52                    \n"
"  v_cmp_u_f32   s[48:49], v248, v248                   \n"
"  v_add3_u32    v50, v248, v53, 1                      \n"
"  v_cndmask_b32  v54, v50, v52, s[48:49]               \n"
"  v_cmp_u_f32   s[48:49], v249, v249                   \n"
"  v_add3_u32    v50, v249, v53, 1                      \n"
"  v_cndmask_b32  v55, v50, v52, s[48:49]               \n"
"  v_perm_b32    v236, v55, v54, s52                    \n"
"  v_cmp_u_f32   s[48:49], v250, v250                   \n"
"  v_add3_u32    v50, v250, v53, 1                      \n"
"  v_cndmask_b32  v54, v50, v52, s[48:49]               \n"
"  v_cmp_u_f32   s[48:49], v251, v251                   \n"
"  v_add3_u32    v50, v251, v53, 1                      \n"
"  v_cndmask_b32  v55, v50, v52, s[48:49]               \n"
"  v_perm_b32    v237, v55, v54, s52                    \n"
"  v_cmp_u_f32   s[48:49], v252, v252                   \n"
"  v_add3_u32    v50, v252, v53, 1                      \n"
"  v_cndmask_b32  v54, v50, v52, s[48:49]               \n"
"  v_cmp_u_f32   s[48:49], v253, v253                   \n"
"  v_add3_u32    v50, v253, v53, 1                      \n"
"  v_cndmask_b32  v55, v50, v52, s[48:49]               \n"
"  v_perm_b32    v238, v55, v54, s52                    \n"
"  v_cmp_u_f32   s[48:49], v254, v254                   \n"
"  v_add3_u32    v50, v254, v53, 1                      \n"
"  v_cndmask_b32  v54, v50, v52, s[48:49]               \n"
"  v_cmp_u_f32   s[48:49], v255, v255                   \n"
"  v_add3_u32    v50, v255, v53, 1                      \n"
"  v_cndmask_b32  v55, v50, v52, s[48:49]               \n"
"  v_perm_b32    v239, v55, v54, s52                    \n"
"  ds_write_b64  v3, v[224:225] offset:35072            \n"
"  ds_write_b64  v3, v[226:227] offset:43776            \n"
"  ds_write_b64  v3, v[228:229] offset:37248            \n"
"  ds_write_b64  v3, v[230:231] offset:45952            \n"
"  ds_write_b64  v3, v[232:233] offset:39424            \n"
"  ds_write_b64  v3, v[234:235] offset:48128            \n"
"  ds_write_b64  v3, v[236:237] offset:41600            \n"
"  ds_write_b64  v3, v[238:239] offset:50304            \n"
"  s_waitcnt     lgkmcnt(0)                             \n"
"  s_barrier                                            \n"
"  ds_read_b32   v64, v4 offset:35072                   \n"
"  ds_read_b32   v65, v4 offset:39424                   \n"
"  ds_read_b32   v66, v4 offset:35104                   \n"
"  ds_read_b32   v67, v4 offset:39456                   \n"
"  ds_read_b32   v68, v4 offset:35136                   \n"
"  ds_read_b32   v69, v4 offset:39488                   \n"
"  ds_read_b32   v70, v4 offset:35168                   \n"
"  ds_read_b32   v71, v4 offset:39520                   \n"
"  ds_read_b32   v72, v4 offset:43776                   \n"
"  ds_read_b32   v73, v4 offset:48128                   \n"
"  ds_read_b32   v74, v4 offset:43808                   \n"
"  ds_read_b32   v75, v4 offset:48160                   \n"
"  ds_read_b32   v76, v4 offset:43840                   \n"
"  ds_read_b32   v77, v4 offset:48192                   \n"
"  ds_read_b32   v78, v4 offset:43872                   \n"
"  ds_read_b32   v79, v4 offset:48224                   \n"
"  s_waitcnt     lgkmcnt(0)                             \n"
"  s_mov_b64     exec, s[20:21]                         \n"
"  global_atomic_pk_add_bf16   v80, v64, s[8:9] \n"
"  s_mov_b64     exec, s[36:37]                         \n"
"  s_mov_b64     exec, s[20:21]                         \n"
"  global_atomic_pk_add_bf16   v80, v65, s[8:9] inst_offset:256\n"
"  s_mov_b64     exec, s[36:37]                         \n"
"  s_mov_b64     exec, s[22:23]                         \n"
"  global_atomic_pk_add_bf16   v82, v66, s[8:9] \n"
"  s_mov_b64     exec, s[36:37]                         \n"
"  s_mov_b64     exec, s[22:23]                         \n"
"  global_atomic_pk_add_bf16   v82, v67, s[8:9] inst_offset:256\n"
"  s_mov_b64     exec, s[36:37]                         \n"
"  s_mov_b64     exec, s[24:25]                         \n"
"  global_atomic_pk_add_bf16   v84, v68, s[8:9] \n"
"  s_mov_b64     exec, s[36:37]                         \n"
"  s_mov_b64     exec, s[24:25]                         \n"
"  global_atomic_pk_add_bf16   v84, v69, s[8:9] inst_offset:256\n"
"  s_mov_b64     exec, s[36:37]                         \n"
"  s_mov_b64     exec, s[26:27]                         \n"
"  global_atomic_pk_add_bf16   v86, v70, s[8:9] \n"
"  s_mov_b64     exec, s[36:37]                         \n"
"  s_mov_b64     exec, s[26:27]                         \n"
"  global_atomic_pk_add_bf16   v86, v71, s[8:9] inst_offset:256\n"
"  s_mov_b64     exec, s[36:37]                         \n"
"  s_mov_b64     exec, s[28:29]                         \n"
"  global_atomic_pk_add_bf16   v88, v72, s[8:9] \n"
"  s_mov_b64     exec, s[36:37]                         \n"
"  s_mov_b64     exec, s[28:29]                         \n"
"  global_atomic_pk_add_bf16   v88, v73, s[8:9] inst_offset:256\n"
"  s_mov_b64     exec, s[36:37]                         \n"
"  s_mov_b64     exec, s[30:31]                         \n"
"  global_atomic_pk_add_bf16   v90, v74, s[8:9] \n"
"  s_mov_b64     exec, s[36:37]                         \n"
"  s_mov_b64     exec, s[30:31]                         \n"
"  global_atomic_pk_add_bf16   v90, v75, s[8:9] inst_offset:256\n"
"  s_mov_b64     exec, s[36:37]                         \n"
"  s_mov_b64     exec, s[32:33]                         \n"
"  global_atomic_pk_add_bf16   v92, v76, s[8:9] \n"
"  s_mov_b64     exec, s[36:37]                         \n"
"  s_mov_b64     exec, s[32:33]                         \n"
"  global_atomic_pk_add_bf16   v92, v77, s[8:9] inst_offset:256\n"
"  s_mov_b64     exec, s[36:37]                         \n"
"  s_mov_b64     exec, s[34:35]                         \n"
"  global_atomic_pk_add_bf16   v94, v78, s[8:9] \n"
"  s_mov_b64     exec, s[36:37]                         \n"
"  s_mov_b64     exec, s[34:35]                         \n"
"  global_atomic_pk_add_bf16   v94, v79, s[8:9] inst_offset:256\n"
"  s_mov_b64     exec, s[36:37]                         \n"
"  s_add_u32     s8, s59, s8                            \n"
"  s_addc_u32    s9, 0, s9                              \n"
"  s_addk_i32    s80, 0x0100                            \n"
"  s_cmp_lt_i32  s80, s81                               \n"
"  s_cbranch_scc0   label_2301           \n"
"  s_branch      label_0C3C  \n" 
" label_2301: \n"
"  s_waitcnt     0x0000                                 \n"
"  s_endpgm                                             \n"
// Drop the helper macros defined at the top of this file so they do not leak
// into whatever translation unit includes it (and so the file can be included
// again with a different CK_TILE_FLATMM_UK_MFMA selection).
#undef _UK_MFMA_
#undef _UK_PK_CVT_
#undef _UK_ATOMIC_ADD_

