Commit 7cc808f2 authored by shengnxu's avatar shengnxu
Browse files

fix some codes

parent 6f7d1272
......@@ -245,10 +245,12 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16
// TODO: this must be paired with tile_window_linear!
// TODO: init_raw() must be called before calling this function!
template <typename Ascale, typename GQscale, typename ARes, typename ACoords, typename BRes, typename BCoords>
template <typename Ascale, typename GQscale, typename DRes, typename DQRes, typename ARes, typename ACoords, typename BRes, typename BCoords>
CK_TILE_DEVICE auto
operator()( const Ascale& a_scale_,
const GQscale& gq_scale_,
const DRes& res_d,
const DQRes& res_dq,
const ARes& res_a,
const ACoords& cached_coords_a,
const BRes& res_b,
......@@ -446,14 +448,22 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16
[a_scale1]"v"(a_scale_[1]),
[gq_scale0]"v"(gq_scale_[0]),
[gq_scale1]"v"(gq_scale_[1]),
[s_res_a]"s"(res_a),
// [s_res_a1]"s"(res_a[1]),
// [s_res_a2]"s"(res_a[2]),
// [s_res_a3]"s"(res_a[3]),
[s_res_b]"s"(res_b),
// [s_res_b1]"s"(res_b[1]),
// [s_res_b2]"s"(res_b[2]),
// [s_res_b3]"s"(res_b[3]),
[s_res_a0]"s"(res_a[0]),
[s_res_a1]"s"(res_a[1]),
[s_res_a2]"s"(res_a[2]),
[s_res_a3]"s"(res_a[3]),
[s_res_b0]"s"(res_b[0]),
[s_res_b1]"s"(res_b[1]),
[s_res_b2]"s"(res_b[2]),
[s_res_b3]"s"(res_b[3]),
[s_res_d0]"s"(res_d[0]),
[s_res_d1]"s"(res_d[1]),
[s_res_d2]"s"(res_d[2]),
[s_res_d3]"s"(res_d[3]),
[s_res_dq0]"s"(res_dq[0]),
[s_res_dq1]"s"(res_dq[1]),
[s_res_dq2]"s"(res_dq[2]),
[s_res_dq3]"s"(res_dq[3]),
[v_os_a0]"v"(static_cast<index_t>(cached_coords_a[number<0>{}] * sizeof(ADataType))),
[v_os_a1]"v"(static_cast<index_t>(cached_coords_a[number<1>{}] * sizeof(ADataType))),
[v_os_a2]"v"(static_cast<index_t>(cached_coords_a[number<2>{}] * sizeof(ADataType))),
......
......@@ -78,8 +78,9 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x
// TODO: this must be paired with tile_window_linear!
// TODO: init_raw() must be called before calling this function!
// template <typename AWindow, typename BWindow, typename OWindow, typename ScaleTensor>
template <typename DQRes,
typename BRes,
template <
// typename DQRes,
// typename BRes,
typename DQCoords,
typename BCoords,
typename ORes,
......@@ -88,8 +89,9 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x
typename ScaleTensor,
typename YScaleTensor>
CK_TILE_DEVICE auto
operator()(const DQRes& res_dq,
const BRes& res_b,
operator()(
// const DQRes& res_dq,
// const BRes& res_b,
const DQCoords& cached_coords_dq,
const BCoords& cached_coords_b,
const ORes& res_o,
......@@ -118,38 +120,6 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x
index_t loop_cnt = n ;
// register float v_c0 asm("v64");
// register float v_c1 asm("v65");
// register float v_c2 asm("v66");
// register float v_c3 asm("v67");
// register float v_c4 asm("v68");
// register float v_c5 asm("v69");
// register float v_c6 asm("v70");
// register float v_c7 asm("v71");
// register float v_c8 asm("v72");
// register float v_c9 asm("v73");
// register float v_c10 asm("v74");
// register float v_c11 asm("v75");
// register float v_c12 asm("v76");
// register float v_c13 asm("v77");
// register float v_c14 asm("v78");
// register float v_c15 asm("v79");
// register float v_c16 asm("v80");
// register float v_c17 asm("v81");
// register float v_c18 asm("v82");
// register float v_c19 asm("v83");
// register float v_c20 asm("v84");
// register float v_c21 asm("v85");
// register float v_c22 asm("v86");
// register float v_c23 asm("v87");
// register float v_c24 asm("v88");
// register float v_c25 asm("v89");
// register float v_c26 asm("v90");
// register float v_c27 asm("v91");
// register float v_c28 asm("v92");
// register float v_c29 asm("v93");
// register float v_c30 asm("v94");
// register float v_c31 asm("v95");
// int32_t nan_hi = 0x7fff0000;
// int32_t nan_lo = 0x00007fff;
......@@ -187,38 +157,6 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x
#undef CK_TILE_FLATMM_UK_MFMA
:[smem_]"+r"(smem),
[s_loop_cnt]"+s"(loop_cnt)
// [c0]"+v" (v_c0),
// [c1]"+v" (v_c1),
// [c2]"+v" (v_c2),
// [c3]"+v" (v_c3),
// [c4]"+v" (v_c4),
// [c5]"+v" (v_c5),
// [c6]"+v" (v_c6),
// [c7]"+v" (v_c7),
// [c8]"+v" (v_c8),
// [c9]"+v" (v_c9),
// [c10]"+v"(v_c10),
// [c11]"+v"(v_c11),
// [c12]"+v"(v_c12),
// [c13]"+v"(v_c13),
// [c14]"+v"(v_c14),
// [c15]"+v"(v_c15),
// [c16]"+v"(v_c16),
// [c17]"+v"(v_c17),
// [c18]"+v"(v_c18),
// [c19]"+v"(v_c19),
// [c20]"+v"(v_c20),
// [c21]"+v"(v_c21),
// [c22]"+v"(v_c22),
// [c23]"+v"(v_c23),
// [c24]"+v"(v_c24),
// [c25]"+v"(v_c25),
// [c26]"+v"(v_c26),
// [c27]"+v"(v_c27),
// [c28]"+v"(v_c28),
// [c29]"+v"(v_c29),
// [c30]"+v"(v_c30),
// [c31]"+v"(v_c31)
:[sld_a_base]"n"(0),
// [shfl_base]"n"(0),
// [v_sld_y_os]"v"(sld_y_os),
......@@ -226,15 +164,10 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x
// [v_sfl_sst]"v"(sfl_sst),
[smq_scale0]"s"(smq_scale_[0]),
[smq_scale1]"s"(smq_scale_[1]),
[s_res_dq]"s"(res_dq),
[s_res_o0]"s"(res_o[0]),
[s_res_o1]"s"(res_o[1]),
//[s_res_o2]"s"(res_o[2]),
//[s_res_o3]"s"(res_o[3]),
[s_res_d]"s"(res_b),
// [s_res_b1]"s"(res_b[1]),
// [s_res_b2]"s"(res_b[2]),
// [s_res_b3]"s"(res_b[3]),
[v_os_dq]"v"(static_cast<index_t>(cached_coords_dq * sizeof(DScaleDataType))),
[v_os_o0]"v"(static_cast<index_t>(cached_coords_o[number<0>{}] * sizeof(ODataType))),
[v_os_o1]"v"(static_cast<index_t>(cached_coords_o[number<1>{}] * sizeof(ODataType))),
......@@ -334,52 +267,10 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x
#undef CK_TILE_FLATMM_UK_MFMA
:[smem_]"+r"(smem),
[s_loop_cnt]"+s"(loop_cnt)
// [c0]"+v" (v_c0),
// [c1]"+v" (v_c1),
// [c2]"+v" (v_c2),
// [c3]"+v" (v_c3),
// [c4]"+v" (v_c4),
// [c5]"+v" (v_c5),
// [c6]"+v" (v_c6),
// [c7]"+v" (v_c7),
// [c8]"+v" (v_c8),
// [c9]"+v" (v_c9),
// [c10]"+v"(v_c10),
// [c11]"+v"(v_c11),
// [c12]"+v"(v_c12),
// [c13]"+v"(v_c13),
// [c14]"+v"(v_c14),
// [c15]"+v"(v_c15),
// [c16]"+v"(v_c16),
// [c17]"+v"(v_c17),
// [c18]"+v"(v_c18),
// [c19]"+v"(v_c19),
// [c20]"+v"(v_c20),
// [c21]"+v"(v_c21),
// [c22]"+v"(v_c22),
// [c23]"+v"(v_c23),
// [c24]"+v"(v_c24),
// [c25]"+v"(v_c25),
// [c26]"+v"(v_c26),
// [c27]"+v"(v_c27),
// [c28]"+v"(v_c28),
// [c29]"+v"(v_c29),
// [c30]"+v"(v_c30),
// [c31]"+v"(v_c31)
:[sld_a_base]"n"(0),
// [shfl_base]"n"(0),
// [v_sld_y_os]"v"(sld_y_os),
// [v_sfl_sld]"v"(sfl_sld),
// [v_sfl_sst]"v"(sfl_sst),
[s_res_dq]"s"(res_dq),
[s_res_o0]"s"(res_o[0]),
[s_res_o1]"s"(res_o[1]),
//[s_res_o2]"s"(res_o[2]),
//[s_res_o3]"s"(res_o[3]),
[s_res_d]"s"(res_b),
// [s_res_b1]"s"(res_b[1]),
// [s_res_b2]"s"(res_b[2]),
// [s_res_b3]"s"(res_b[3]),
[v_os_dq]"v"(static_cast<index_t>(cached_coords_dq * sizeof(DScaleDataType))),
[v_os_o0]"v"(static_cast<index_t>(cached_coords_o[number<0>{}] * sizeof(ODataType))),
[v_os_o1]"v"(static_cast<index_t>(cached_coords_o[number<1>{}] * sizeof(ODataType))),
[v_os_o2]"v"(static_cast<index_t>(cached_coords_o[number<2>{}] * sizeof(ODataType))),
......
......@@ -31,8 +31,8 @@
" v_lshrrev_b32 v3, 6, v0 \n"
" v_readfirstlane_b32 s7, v3 \n"
" s_waitcnt vmcnt(24) \n"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], %[s_res_d], 0 offen\n"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], %[s_res_d], 0 offen offset:1024\n"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen\n"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024\n"
" v_mul_f32 v54, v128, v128 \n"
" v_mul_f32 v55, v129, v129 \n"
" v_mul_f32 v56, v130, v130 \n"
......@@ -65,7 +65,7 @@
" v_mul_f32 v129, v129, v55 \n"
" v_mul_f32 v130, v130, v56 \n"
" v_mul_f32 v131, v131, v57 \n"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], %[s_res_d], 0 offen offset:2048\n"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048\n"
" v_mul_f32 v54, v132, v132 \n"
" v_mul_f32 v55, v133, v133 \n"
" v_mul_f32 v56, v134, v134 \n"
......@@ -86,7 +86,7 @@
" v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], %[s_res_d], 0 offen offset:3072\n"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072\n"
" v_add_f32 v54, v54, 1.0 \n"
" v_add_f32 v55, v55, 1.0 \n"
" v_add_f32 v56, v56, 1.0 \n"
......@@ -99,7 +99,7 @@
" v_mul_f32 v133, v133, v55 \n"
" v_mul_f32 v134, v134, v56 \n"
" v_mul_f32 v135, v135, v57 \n"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], %[s_res_d], 0 offen\n"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen\n"
" v_mul_f32 v54, v136, v136 \n"
" v_mul_f32 v55, v137, v137 \n"
" v_mul_f32 v56, v138, v138 \n"
......@@ -120,7 +120,7 @@
" v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], %[s_res_d], 0 offen offset:1024\n"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024\n"
" v_add_f32 v54, v54, 1.0 \n"
" v_add_f32 v55, v55, 1.0 \n"
" v_add_f32 v56, v56, 1.0 \n"
......@@ -133,7 +133,7 @@
" v_mul_f32 v137, v137, v55 \n"
" v_mul_f32 v138, v138, v56 \n"
" v_mul_f32 v139, v139, v57 \n"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], %[s_res_d], 0 offen offset:2048\n"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048\n"
" v_mul_f32 v54, v140, v140 \n"
" v_mul_f32 v55, v141, v141 \n"
" v_mul_f32 v56, v142, v142 \n"
......@@ -154,7 +154,7 @@
" v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], %[s_res_d], 0 offen offset:3072\n"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072\n"
" v_add_f32 v54, v54, 1.0 \n"
" v_add_f32 v55, v55, 1.0 \n"
" v_add_f32 v56, v56, 1.0 \n"
......@@ -168,7 +168,7 @@
" v_mul_f32 v142, v142, v56 \n"
" v_mul_f32 v143, v143, v57 \n"
" s_waitcnt vmcnt(24) \n"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], %[s_res_d], 0 offen\n"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen\n"
" v_mul_f32 v54, v144, v144 \n"
" v_mul_f32 v55, v145, v145 \n"
" v_mul_f32 v56, v146, v146 \n"
......@@ -189,7 +189,7 @@
" v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], %[s_res_d], 0 offen offset:1024\n"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024\n"
" v_add_f32 v54, v54, 1.0 \n"
" v_add_f32 v55, v55, 1.0 \n"
" v_add_f32 v56, v56, 1.0 \n"
......@@ -202,7 +202,7 @@
" v_mul_f32 v145, v145, v55 \n"
" v_mul_f32 v146, v146, v56 \n"
" v_mul_f32 v147, v147, v57 \n"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], %[s_res_d], 0 offen offset:2048\n"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048\n"
" v_mul_f32 v54, v148, v148 \n"
" v_mul_f32 v55, v149, v149 \n"
" v_mul_f32 v56, v150, v150 \n"
......@@ -223,7 +223,7 @@
" v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], %[s_res_d], 0 offen offset:3072\n"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072\n"
" v_add_f32 v54, v54, 1.0 \n"
" v_add_f32 v55, v55, 1.0 \n"
" v_add_f32 v56, v56, 1.0 \n"
......@@ -236,7 +236,7 @@
" v_mul_f32 v149, v149, v55 \n"
" v_mul_f32 v150, v150, v56 \n"
" v_mul_f32 v151, v151, v57 \n"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], %[s_res_d], 0 offen\n"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen\n"
" v_mul_f32 v54, v152, v152 \n"
" v_mul_f32 v55, v153, v153 \n"
" v_mul_f32 v56, v154, v154 \n"
......@@ -257,7 +257,7 @@
" v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], %[s_res_d], 0 offen offset:1024\n"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024\n"
" v_add_f32 v54, v54, 1.0 \n"
" v_add_f32 v55, v55, 1.0 \n"
" v_add_f32 v56, v56, 1.0 \n"
......@@ -270,7 +270,7 @@
" v_mul_f32 v153, v153, v55 \n"
" v_mul_f32 v154, v154, v56 \n"
" v_mul_f32 v155, v155, v57 \n"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], %[s_res_d], 0 offen offset:2048\n"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048\n"
" v_mul_f32 v54, v156, v156 \n"
" v_mul_f32 v55, v157, v157 \n"
" v_mul_f32 v56, v158, v158 \n"
......@@ -291,7 +291,7 @@
" v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], %[s_res_d], 0 offen offset:3072\n"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072\n"
" s_add_u32 s12, %[s_tile_os_b_half], s12 \n"
" s_addc_u32 s13, 0, s13 \n"
" v_add_f32 v54, v54, 1.0 \n"
......@@ -307,7 +307,7 @@
" v_mul_f32 v158, v158, v56 \n"
" v_mul_f32 v159, v159, v57 \n"
" s_waitcnt vmcnt(24) \n"
" buffer_load_dwordx4 acc[64:67], %[v_os_b0], %[s_res_d], 0 offen\n"
" buffer_load_dwordx4 acc[64:67], %[v_os_b0], s[12:15], 0 offen\n"
" v_mul_f32 v54, v160, v160 \n"
" v_mul_f32 v55, v161, v161 \n"
" v_mul_f32 v56, v162, v162 \n"
......@@ -328,7 +328,7 @@
" v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[68:71], %[v_os_b0], %[s_res_d], 0 offen offset:1024\n"
" buffer_load_dwordx4 acc[68:71], %[v_os_b0], s[12:15], 0 offen offset:1024\n"
" v_add_f32 v54, v54, 1.0 \n"
" v_add_f32 v55, v55, 1.0 \n"
" v_add_f32 v56, v56, 1.0 \n"
......@@ -341,7 +341,7 @@
" v_mul_f32 v161, v161, v55 \n"
" v_mul_f32 v162, v162, v56 \n"
" v_mul_f32 v163, v163, v57 \n"
" buffer_load_dwordx4 acc[72:75], %[v_os_b0], %[s_res_d], 0 offen offset:2048\n"
" buffer_load_dwordx4 acc[72:75], %[v_os_b0], s[12:15], 0 offen offset:2048\n"
" v_mul_f32 v54, v164, v164 \n"
" v_mul_f32 v55, v165, v165 \n"
" v_mul_f32 v56, v166, v166 \n"
......@@ -362,7 +362,7 @@
" v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[76:79], %[v_os_b0], %[s_res_d], 0 offen offset:3072\n"
" buffer_load_dwordx4 acc[76:79], %[v_os_b0], s[12:15], 0 offen offset:3072\n"
" v_add_f32 v54, v54, 1.0 \n"
" v_add_f32 v55, v55, 1.0 \n"
" v_add_f32 v56, v56, 1.0 \n"
......@@ -375,7 +375,7 @@
" v_mul_f32 v165, v165, v55 \n"
" v_mul_f32 v166, v166, v56 \n"
" v_mul_f32 v167, v167, v57 \n"
" buffer_load_dwordx4 acc[80:83], %[v_os_b1], %[s_res_d], 0 offen\n"
" buffer_load_dwordx4 acc[80:83], %[v_os_b1], s[12:15], 0 offen\n"
" v_mul_f32 v54, v168, v168 \n"
" v_mul_f32 v55, v169, v169 \n"
" v_mul_f32 v56, v170, v170 \n"
......@@ -396,7 +396,7 @@
" v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[84:87], %[v_os_b1], %[s_res_d], 0 offen offset:1024\n"
" buffer_load_dwordx4 acc[84:87], %[v_os_b1], s[12:15], 0 offen offset:1024\n"
" v_add_f32 v54, v54, 1.0 \n"
" v_add_f32 v55, v55, 1.0 \n"
" v_add_f32 v56, v56, 1.0 \n"
......@@ -409,7 +409,7 @@
" v_mul_f32 v169, v169, v55 \n"
" v_mul_f32 v170, v170, v56 \n"
" v_mul_f32 v171, v171, v57 \n"
" buffer_load_dwordx4 acc[88:91], %[v_os_b1], %[s_res_d], 0 offen offset:2048\n"
" buffer_load_dwordx4 acc[88:91], %[v_os_b1], s[12:15], 0 offen offset:2048\n"
" v_mul_f32 v54, v172, v172 \n"
" v_mul_f32 v55, v173, v173 \n"
" v_mul_f32 v56, v174, v174 \n"
......@@ -430,7 +430,7 @@
" v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[92:95], %[v_os_b1], %[s_res_d], 0 offen offset:3072\n"
" buffer_load_dwordx4 acc[92:95], %[v_os_b1], s[12:15], 0 offen offset:3072\n"
" v_add_f32 v54, v54, 1.0 \n"
" v_add_f32 v55, v55, 1.0 \n"
" v_add_f32 v56, v56, 1.0 \n"
......@@ -444,7 +444,7 @@
" v_mul_f32 v174, v174, v56 \n"
" v_mul_f32 v175, v175, v57 \n"
" s_waitcnt vmcnt(24) \n"
" buffer_load_dwordx4 acc[96:99], %[v_os_b2], %[s_res_d], 0 offen\n"
" buffer_load_dwordx4 acc[96:99], %[v_os_b2], s[12:15], 0 offen\n"
" v_mul_f32 v54, v176, v176 \n"
" v_mul_f32 v55, v177, v177 \n"
" v_mul_f32 v56, v178, v178 \n"
......@@ -465,7 +465,7 @@
" v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[100:103], %[v_os_b2], %[s_res_d], 0 offen offset:1024\n"
" buffer_load_dwordx4 acc[100:103], %[v_os_b2], s[12:15], 0 offen offset:1024\n"
" v_add_f32 v54, v54, 1.0 \n"
" v_add_f32 v55, v55, 1.0 \n"
" v_add_f32 v56, v56, 1.0 \n"
......@@ -478,7 +478,7 @@
" v_mul_f32 v177, v177, v55 \n"
" v_mul_f32 v178, v178, v56 \n"
" v_mul_f32 v179, v179, v57 \n"
" buffer_load_dwordx4 acc[104:107], %[v_os_b2], %[s_res_d], 0 offen offset:2048\n"
" buffer_load_dwordx4 acc[104:107], %[v_os_b2], s[12:15], 0 offen offset:2048\n"
" v_mul_f32 v54, v180, v180 \n"
" v_mul_f32 v55, v181, v181 \n"
" v_mul_f32 v56, v182, v182 \n"
......@@ -499,7 +499,7 @@
" v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[108:111], %[v_os_b2], %[s_res_d], 0 offen offset:3072\n"
" buffer_load_dwordx4 acc[108:111], %[v_os_b2], s[12:15], 0 offen offset:3072\n"
" v_add_f32 v54, v54, 1.0 \n"
" v_add_f32 v55, v55, 1.0 \n"
" v_add_f32 v56, v56, 1.0 \n"
......@@ -512,7 +512,7 @@
" v_mul_f32 v181, v181, v55 \n"
" v_mul_f32 v182, v182, v56 \n"
" v_mul_f32 v183, v183, v57 \n"
" buffer_load_dwordx4 acc[112:115], %[v_os_b3], %[s_res_d], 0 offen\n"
" buffer_load_dwordx4 acc[112:115], %[v_os_b3], s[12:15], 0 offen\n"
" v_mul_f32 v54, v184, v184 \n"
" v_mul_f32 v55, v185, v185 \n"
" v_mul_f32 v56, v186, v186 \n"
......@@ -533,7 +533,7 @@
" v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[116:119], %[v_os_b3], %[s_res_d], 0 offen offset:1024\n"
" buffer_load_dwordx4 acc[116:119], %[v_os_b3], s[12:15], 0 offen offset:1024\n"
" v_add_f32 v54, v54, 1.0 \n"
" v_add_f32 v55, v55, 1.0 \n"
" v_add_f32 v56, v56, 1.0 \n"
......@@ -546,7 +546,7 @@
" v_mul_f32 v185, v185, v55 \n"
" v_mul_f32 v186, v186, v56 \n"
" v_mul_f32 v187, v187, v57 \n"
" buffer_load_dwordx4 acc[120:123], %[v_os_b3], %[s_res_d], 0 offen offset:2048\n"
" buffer_load_dwordx4 acc[120:123], %[v_os_b3], s[12:15], 0 offen offset:2048\n"
" v_mul_f32 v54, v188, v188 \n"
" v_mul_f32 v55, v189, v189 \n"
" v_mul_f32 v56, v190, v190 \n"
......@@ -567,7 +567,7 @@
" v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[124:127], %[v_os_b3], %[s_res_d], 0 offen offset:3072\n"
" buffer_load_dwordx4 acc[124:127], %[v_os_b3], s[12:15], 0 offen offset:3072\n"
" v_add_f32 v54, v54, 1.0 \n"
" v_add_f32 v55, v55, 1.0 \n"
" v_add_f32 v56, v56, 1.0 \n"
......@@ -644,7 +644,7 @@
" v_mul_f32 v189, %[smq_scale1], v189 row_newbcast:13 \n"
" v_mul_f32 v190, %[smq_scale1], v190 row_newbcast:14 \n"
" v_mul_f32 v191, %[smq_scale1], v191 row_newbcast:15 \n"
" buffer_load_dword v12, %[v_os_dq], %[s_res_dq], 0 offen \n"
" buffer_load_dword v12, %[v_os_dq], s[16:19], 0 offen \n"
" v_mov_b32 v22, 0x358637bd \n"
" v_mov_b32 v23, 0x358637bd \n"
" v_max3_f32 v22, abs(v128), abs(v129), v22 \n"
......@@ -974,5 +974,3 @@
#undef _UK_PK_CVT_
#undef _UK_ATOMIC_ADD_
......@@ -27,179 +27,179 @@
# define _UK_ATOMIC_ADD_ "global_atomic_pk_add_f16"
#endif
" s_add_u32 %[s_res_d][0], %[s_tile_os_b], %[s_res_d][0] \n"
" s_addc_u32 %[s_res_d][1], 0, %[s_res_d][1] \n"
" s_add_u32 %[s_res_dq][0], %[s_tile_os_dq], %[s_res_dq][0] \n"
" s_addc_u32 %[s_res_dq][1], 0, %[s_res_dq][1] \n"
" s_add_u32 s12, %[s_tile_os_b], s12 \n"
" s_addc_u32 s13, 0, s13 \n"
" s_add_u32 s16, %[s_tile_os_dq], s16 \n"
" s_addc_u32 s17, 0, s17 \n"
" s_mov_b32 s80, 0 \n"
" s_waitcnt 0x0000 \n"
"label_startgemm2: \n"
" label_startgemm2: \n"
" s_waitcnt vmcnt(41) \n"
" s_barrier \n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[0:1], v[128:129], 0 \n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[2:3], v[130:131], v[192:195] \n"
" buffer_load_dwordx4 acc[128:131], %[v_os_b0], %[s_res_d], 0 offen \n"
" buffer_load_dwordx4 acc[128:131], %[v_os_b0], s[12:15], 0 offen \n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[4:5], v[132:133], v[192:195] \n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[6:7], v[134:135], v[192:195] \n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[8:9], v[136:137], v[192:195] \n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[10:11], v[138:139], v[192:195] \n"
" buffer_load_dwordx4 acc[132:135], %[v_os_b0], %[s_res_d], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[132:135], %[v_os_b0], s[12:15], 0 offen offset:1024 \n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[12:13], v[140:141], v[192:195] \n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[14:15], v[142:143], v[192:195] \n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[0:1], v[160:161], 0 \n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[2:3], v[162:163], v[196:199] \n"
" buffer_load_dwordx4 acc[136:139], %[v_os_b0], %[s_res_d], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[136:139], %[v_os_b0], s[12:15], 0 offen offset:2048 \n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[4:5], v[164:165], v[196:199] \n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[6:7], v[166:167], v[196:199] \n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[8:9], v[168:169], v[196:199] \n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[10:11], v[170:171], v[196:199] \n"
" buffer_load_dwordx4 acc[140:143], %[v_os_b0], %[s_res_d], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[140:143], %[v_os_b0], s[12:15], 0 offen offset:3072 \n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[12:13], v[172:173], v[196:199] \n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[14:15], v[174:175], v[196:199] \n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[16:17], v[128:129], 0 \n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[18:19], v[130:131], v[200:203] \n"
" buffer_load_dwordx4 acc[144:147], %[v_os_b1], %[s_res_d], 0 offen \n"
" buffer_load_dwordx4 acc[144:147], %[v_os_b1], s[12:15], 0 offen \n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[20:21], v[132:133], v[200:203] \n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[22:23], v[134:135], v[200:203] \n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[24:25], v[136:137], v[200:203] \n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[26:27], v[138:139], v[200:203] \n"
" buffer_load_dwordx4 acc[148:151], %[v_os_b1], %[s_res_d], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[148:151], %[v_os_b1], s[12:15], 0 offen offset:1024 \n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[28:29], v[140:141], v[200:203] \n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[30:31], v[142:143], v[200:203] \n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[16:17], v[160:161], 0 \n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[18:19], v[162:163], v[204:207] \n"
" buffer_load_dwordx4 acc[152:155], %[v_os_b1], %[s_res_d], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[152:155], %[v_os_b1], s[12:15], 0 offen offset:2048 \n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[20:21], v[164:165], v[204:207] \n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[22:23], v[166:167], v[204:207] \n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[24:25], v[168:169], v[204:207] \n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[26:27], v[170:171], v[204:207] \n"
" buffer_load_dwordx4 acc[156:159], %[v_os_b1], %[s_res_d], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[156:159], %[v_os_b1], s[12:15], 0 offen offset:3072 \n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[28:29], v[172:173], v[204:207] \n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[30:31], v[174:175], v[204:207] \n"
" s_waitcnt vmcnt(41) \n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[32:33], v[128:129], 0 \n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[34:35], v[130:131], v[208:211] \n"
" buffer_load_dwordx4 acc[160:163], %[v_os_b2], %[s_res_d], 0 offen \n"
" buffer_load_dwordx4 acc[160:163], %[v_os_b2], s[12:15], 0 offen \n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[36:37], v[132:133], v[208:211] \n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[38:39], v[134:135], v[208:211] \n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[40:41], v[136:137], v[208:211] \n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[42:43], v[138:139], v[208:211] \n"
" buffer_load_dwordx4 acc[164:167], %[v_os_b2], %[s_res_d], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[164:167], %[v_os_b2], s[12:15], 0 offen offset:1024 \n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[44:45], v[140:141], v[208:211] \n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[46:47], v[142:143], v[208:211] \n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[32:33], v[160:161], 0 \n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[34:35], v[162:163], v[212:215] \n"
" buffer_load_dwordx4 acc[168:171], %[v_os_b2], %[s_res_d], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[168:171], %[v_os_b2], s[12:15], 0 offen offset:2048 \n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[36:37], v[164:165], v[212:215] \n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[38:39], v[166:167], v[212:215] \n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[40:41], v[168:169], v[212:215] \n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[42:43], v[170:171], v[212:215] \n"
" buffer_load_dwordx4 acc[172:175], %[v_os_b2], %[s_res_d], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[172:175], %[v_os_b2], s[12:15], 0 offen offset:3072 \n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[44:45], v[172:173], v[212:215] \n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[46:47], v[174:175], v[212:215] \n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[48:49], v[128:129], 0 \n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[50:51], v[130:131], v[216:219] \n"
" buffer_load_dwordx4 acc[176:179], %[v_os_b3], %[s_res_d], 0 offen \n"
" buffer_load_dwordx4 acc[176:179], %[v_os_b3], s[12:15], 0 offen \n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[52:53], v[132:133], v[216:219] \n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[54:55], v[134:135], v[216:219] \n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[56:57], v[136:137], v[216:219] \n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[58:59], v[138:139], v[216:219] \n"
" buffer_load_dwordx4 acc[180:183], %[v_os_b3], %[s_res_d], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[180:183], %[v_os_b3], s[12:15], 0 offen offset:1024 \n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[60:61], v[140:141], v[216:219] \n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[62:63], v[142:143], v[216:219] \n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[48:49], v[160:161], 0 \n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[50:51], v[162:163], v[220:223] \n"
" buffer_load_dwordx4 acc[184:187], %[v_os_b3], %[s_res_d], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[184:187], %[v_os_b3], s[12:15], 0 offen offset:2048 \n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[52:53], v[164:165], v[220:223] \n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[54:55], v[166:167], v[220:223] \n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[56:57], v[168:169], v[220:223] \n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[58:59], v[170:171], v[220:223] \n"
" buffer_load_dwordx4 acc[188:191], %[v_os_b3], %[s_res_d], 0 offen offset:3072 \n"
" s_add_u32 %[s_res_d][0], %[s_tile_os_b_half], %[s_res_d][0] \n"
" s_addc_u32 %[s_res_d][1], 0, %[s_res_d][1] \n"
" buffer_load_dwordx4 acc[188:191], %[v_os_b3], s[12:15], 0 offen offset:3072 \n"
" s_add_u32 s12, %[s_tile_os_b_half], s12 \n"
" s_addc_u32 s13, 0, s13 \n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[60:61], v[172:173], v[220:223] \n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[62:63], v[174:175], v[220:223] \n"
" s_waitcnt vmcnt(41) \n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[64:65], v[144:145], v[192:195] \n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[66:67], v[146:147], v[192:195] \n"
" buffer_load_dwordx4 acc[192:195], %[v_os_b0], %[s_res_d], 0 offen \n"
" buffer_load_dwordx4 acc[192:195], %[v_os_b0], s[12:15], 0 offen \n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[68:69], v[148:149], v[192:195] \n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[70:71], v[150:151], v[192:195] \n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[72:73], v[152:153], v[192:195] \n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[74:75], v[154:155], v[192:195] \n"
" buffer_load_dwordx4 acc[196:199], %[v_os_b0], %[s_res_d], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[196:199], %[v_os_b0], s[12:15], 0 offen offset:1024 \n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[76:77], v[156:157], v[192:195] \n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[78:79], v[158:159], v[192:195] \n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[64:65], v[176:177], v[196:199] \n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[66:67], v[178:179], v[196:199] \n"
" buffer_load_dwordx4 acc[200:203], %[v_os_b0], %[s_res_d], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[200:203], %[v_os_b0], s[12:15], 0 offen offset:2048 \n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[68:69], v[180:181], v[196:199] \n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[70:71], v[182:183], v[196:199] \n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[72:73], v[184:185], v[196:199] \n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[74:75], v[186:187], v[196:199] \n"
" buffer_load_dwordx4 acc[204:207], %[v_os_b0], %[s_res_d], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[204:207], %[v_os_b0], s[12:15], 0 offen offset:3072 \n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[76:77], v[188:189], v[196:199] \n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[78:79], v[190:191], v[196:199] \n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[80:81], v[144:145], v[200:203] \n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[82:83], v[146:147], v[200:203] \n"
" buffer_load_dwordx4 acc[208:211], %[v_os_b1], %[s_res_d], 0 offen \n"
" buffer_load_dwordx4 acc[208:211], %[v_os_b1], s[12:15], 0 offen \n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[84:85], v[148:149], v[200:203] \n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[86:87], v[150:151], v[200:203] \n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[88:89], v[152:153], v[200:203] \n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[90:91], v[154:155], v[200:203] \n"
" buffer_load_dwordx4 acc[212:215], %[v_os_b1], %[s_res_d], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[212:215], %[v_os_b1], s[12:15], 0 offen offset:1024 \n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[92:93], v[156:157], v[200:203] \n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[94:95], v[158:159], v[200:203] \n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[80:81], v[176:177], v[204:207] \n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[82:83], v[178:179], v[204:207] \n"
" buffer_load_dwordx4 acc[216:219], %[v_os_b1], %[s_res_d], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[216:219], %[v_os_b1], s[12:15], 0 offen offset:2048 \n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[84:85], v[180:181], v[204:207] \n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[86:87], v[182:183], v[204:207] \n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[88:89], v[184:185], v[204:207] \n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[90:91], v[186:187], v[204:207] \n"
" buffer_load_dwordx4 acc[220:223], %[v_os_b1], %[s_res_d], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[220:223], %[v_os_b1], s[12:15], 0 offen offset:3072 \n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[92:93], v[188:189], v[204:207] \n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[94:95], v[190:191], v[204:207] \n"
" s_waitcnt vmcnt(40) \n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[96:97], v[144:145], v[208:211] \n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[98:99], v[146:147], v[208:211] \n"
" buffer_load_dwordx4 acc[224:227], %[v_os_b2], %[s_res_d], 0 offen \n"
" buffer_load_dwordx4 acc[224:227], %[v_os_b2], s[12:15], 0 offen \n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[100:101], v[148:149], v[208:211] \n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[102:103], v[150:151], v[208:211] \n"
" buffer_load_dword v13, %[v_os_dq], %[s_res_dq], 0 offen \n"
" buffer_load_dword v13, %[v_os_dq], s[16:19], 0 offen \n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[104:105], v[152:153], v[208:211] \n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[106:107], v[154:155], v[208:211] \n"
" buffer_load_dwordx4 acc[228:231], %[v_os_b2], %[s_res_d], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[228:231], %[v_os_b2], s[12:15], 0 offen offset:1024 \n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[108:109], v[156:157], v[208:211] \n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[110:111], v[158:159], v[208:211] \n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[96:97], v[176:177], v[212:215] \n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[98:99], v[178:179], v[212:215] \n"
" buffer_load_dwordx4 acc[232:235], %[v_os_b2], %[s_res_d], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[232:235], %[v_os_b2], s[12:15], 0 offen offset:2048 \n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[100:101], v[180:181], v[212:215] \n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[102:103], v[182:183], v[212:215] \n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[104:105], v[184:185], v[212:215] \n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[106:107], v[186:187], v[212:215] \n"
" buffer_load_dwordx4 acc[236:239], %[v_os_b2], %[s_res_d], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[236:239], %[v_os_b2], s[12:15], 0 offen offset:3072 \n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[108:109], v[188:189], v[212:215] \n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[110:111], v[190:191], v[212:215] \n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[112:113], v[144:145], v[216:219] \n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[114:115], v[146:147], v[216:219] \n"
" buffer_load_dwordx4 acc[240:243], %[v_os_b3], %[s_res_d], 0 offen \n"
" buffer_load_dwordx4 acc[240:243], %[v_os_b3], s[12:15], 0 offen \n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[116:117], v[148:149], v[216:219] \n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[118:119], v[150:151], v[216:219] \n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[120:121], v[152:153], v[216:219] \n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[122:123], v[154:155], v[216:219] \n"
" buffer_load_dwordx4 acc[244:247], %[v_os_b3], %[s_res_d], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[244:247], %[v_os_b3], s[12:15], 0 offen offset:1024 \n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[124:125], v[156:157], v[216:219] \n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[126:127], v[158:159], v[216:219] \n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[112:113], v[176:177], v[220:223] \n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[114:115], v[178:179], v[220:223] \n"
" buffer_load_dwordx4 acc[248:251], %[v_os_b3], %[s_res_d], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[248:251], %[v_os_b3], s[12:15], 0 offen offset:2048 \n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[116:117], v[180:181], v[220:223] \n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[118:119], v[182:183], v[220:223] \n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[120:121], v[184:185], v[220:223] \n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[122:123], v[186:187], v[220:223] \n"
" buffer_load_dwordx4 acc[252:255], %[v_os_b3], %[s_res_d], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[252:255], %[v_os_b3], s[12:15], 0 offen offset:3072 \n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[124:125], v[188:189], v[220:223] \n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[126:127], v[190:191], v[220:223] \n"
" s_add_u32 s60, 0x00000200, s80 \n"
......@@ -207,10 +207,10 @@
" s_cselect_b32 %[s_tile_os_b], %[s_tile_os_b], 0 \n"
" s_cselect_b32 %[s_tile_os_b_half], %[s_tile_os_b_half], 0 \n"
" s_cselect_b32 %[s_tile_os_dq], %[s_tile_os_dq], 0 \n"
" s_add_u32 %[s_res_d][0], %[s_tile_os_b], %[s_res_d][0] \n"
" s_addc_u32 %[s_res_d][1], 0, %[s_res_d][1] \n"
" s_add_u32 %[s_res_dq][0], %[s_tile_os_dq], %[s_res_dq][0] \n"
" s_addc_u32 %[s_res_dq][1], 0, %[s_res_dq][1] \n"
" s_add_u32 s12, %[s_tile_os_b], s12 \n"
" s_addc_u32 s13, 0, s13 \n"
" s_add_u32 s16, %[s_tile_os_dq], s16 \n"
" s_addc_u32 s17, 0, s17 \n"
" v_cvt_f32_i32 v192, v192 \n"
" v_cvt_f32_i32 v193, v193 \n"
" v_cvt_f32_i32 v194, v194 \n"
......@@ -491,40 +491,40 @@
" global_atomic_pk_add_bf16 %[v_os_o1], v67, [%[s_res_o0],%[s_res_o1]] inst_offset:256 \n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[24:25] \n"
" global_atomic_pk_add_bf16 %[v_os_o3], v68, [%[s_res_o0],%[s_res_o1]] \n"
" global_atomic_pk_add_bf16 %[v_os_o2], v68, [%[s_res_o0],%[s_res_o1]] \n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[24:25] \n"
" global_atomic_pk_add_bf16 %[v_os_o3], v69, [%[s_res_o0],%[s_res_o1]] inst_offset:256 \n"
" global_atomic_pk_add_bf16 %[v_os_o2], v69, [%[s_res_o0],%[s_res_o1]] inst_offset:256 \n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[26:27] \n"
" global_atomic_pk_add_bf16 %[v_os_o4], v70, [%[s_res_o0],%[s_res_o1]] \n"
" global_atomic_pk_add_bf16 %[v_os_o3], v70, [%[s_res_o0],%[s_res_o1]] \n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[26:27] \n"
" global_atomic_pk_add_bf16 %[v_os_o4], v71, [%[s_res_o0],%[s_res_o1]] inst_offset:256 \n"
" global_atomic_pk_add_bf16 %[v_os_o3], v71, [%[s_res_o0],%[s_res_o1]] inst_offset:256 \n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[28:29] \n"
" global_atomic_pk_add_bf16 %[v_os_o5], v72, [%[s_res_o0],%[s_res_o1]] \n"
" global_atomic_pk_add_bf16 %[v_os_o4], v72, [%[s_res_o0],%[s_res_o1]] \n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[28:29] \n"
" global_atomic_pk_add_bf16 %[v_os_o5], v73, [%[s_res_o0],%[s_res_o1]] inst_offset:256 \n"
" global_atomic_pk_add_bf16 %[v_os_o4], v73, [%[s_res_o0],%[s_res_o1]] inst_offset:256 \n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[30:31] \n"
" global_atomic_pk_add_bf16 %[v_os_o6], v74, [%[s_res_o0],%[s_res_o1]] \n"
" global_atomic_pk_add_bf16 %[v_os_o5], v74, [%[s_res_o0],%[s_res_o1]] \n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[30:31] \n"
" global_atomic_pk_add_bf16 %[v_os_o6], v75, [%[s_res_o0],%[s_res_o1]] inst_offset:256 \n"
" global_atomic_pk_add_bf16 %[v_os_o5], v75, [%[s_res_o0],%[s_res_o1]] inst_offset:256 \n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[32:33] \n"
" global_atomic_pk_add_bf16 %[v_os_o7], v76, [%[s_res_o0],%[s_res_o1]] \n"
" global_atomic_pk_add_bf16 %[v_os_o6], v76, [%[s_res_o0],%[s_res_o1]] \n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[32:33] \n"
" global_atomic_pk_add_bf16 %[v_os_o7], v77, [%[s_res_o0],%[s_res_o1]] inst_offset:256 \n"
" global_atomic_pk_add_bf16 %[v_os_o6], v77, [%[s_res_o0],%[s_res_o1]] inst_offset:256 \n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[34:35] \n"
" global_atomic_pk_add_bf16 %[v_os_o8], v78, [%[s_res_o0],%[s_res_o1]] \n"
" global_atomic_pk_add_bf16 %[v_os_o7], v78, [%[s_res_o0],%[s_res_o1]] \n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[34:35] \n"
" global_atomic_pk_add_bf16 %[v_os_o8], v79, [%[s_res_o0],%[s_res_o1]] inst_offset:256 \n"
" global_atomic_pk_add_bf16 %[v_os_o7], v79, [%[s_res_o0],%[s_res_o1]] inst_offset:256 \n"
" s_mov_b64 exec, s[36:37] \n"
" s_add_u32 %[s_res_o0], s59, %[s_res_o0] \n"
" s_addc_u32 %[s_res_o1], 0, %[s_res_o1] \n"
......@@ -535,168 +535,168 @@
" s_barrier \n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[128:129], v[128:129], 0 \n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[130:131], v[130:131], v[224:227] \n"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], %[s_res_d], 0 offen \n"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen \n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[132:133], v[132:133], v[224:227] \n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[134:135], v[134:135], v[224:227] \n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[136:137], v[136:137], v[224:227] \n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[138:139], v[138:139], v[224:227] \n"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], %[s_res_d], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024 \n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[140:141], v[140:141], v[224:227] \n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[142:143], v[142:143], v[224:227] \n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[128:129], v[160:161], 0 \n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[130:131], v[162:163], v[228:231] \n"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], %[s_res_d], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048 \n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[132:133], v[164:165], v[228:231] \n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[134:135], v[166:167], v[228:231] \n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[136:137], v[168:169], v[228:231] \n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[138:139], v[170:171], v[228:231] \n"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], %[s_res_d], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072 \n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[140:141], v[172:173], v[228:231] \n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[142:143], v[174:175], v[228:231] \n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[144:145], v[128:129], 0 \n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[146:147], v[130:131], v[232:235] \n"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], %[s_res_d], 0 offen \n"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen \n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[148:149], v[132:133], v[232:235] \n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[150:151], v[134:135], v[232:235] \n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[152:153], v[136:137], v[232:235] \n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[154:155], v[138:139], v[232:235] \n"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], %[s_res_d], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024 \n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[156:157], v[140:141], v[232:235] \n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[158:159], v[142:143], v[232:235] \n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[144:145], v[160:161], 0 \n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[146:147], v[162:163], v[236:239] \n"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], %[s_res_d], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048 \n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[148:149], v[164:165], v[236:239] \n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[150:151], v[166:167], v[236:239] \n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[152:153], v[168:169], v[236:239] \n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[154:155], v[170:171], v[236:239] \n"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], %[s_res_d], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072 \n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[156:157], v[172:173], v[236:239] \n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[158:159], v[174:175], v[236:239] \n"
" s_waitcnt vmcnt(41) \n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[160:161], v[128:129], 0 \n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[162:163], v[130:131], v[240:243] \n"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], %[s_res_d], 0 offen \n"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen \n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[164:165], v[132:133], v[240:243] \n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[166:167], v[134:135], v[240:243] \n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[168:169], v[136:137], v[240:243] \n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[170:171], v[138:139], v[240:243] \n"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], %[s_res_d], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024 \n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[172:173], v[140:141], v[240:243] \n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[174:175], v[142:143], v[240:243] \n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[160:161], v[160:161], 0 \n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[162:163], v[162:163], v[244:247] \n"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], %[s_res_d], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048 \n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[164:165], v[164:165], v[244:247] \n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[166:167], v[166:167], v[244:247] \n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[168:169], v[168:169], v[244:247] \n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[170:171], v[170:171], v[244:247] \n"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], %[s_res_d], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072 \n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[172:173], v[172:173], v[244:247] \n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[174:175], v[174:175], v[244:247] \n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[176:177], v[128:129], 0 \n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[178:179], v[130:131], v[248:251] \n"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], %[s_res_d], 0 offen \n"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen \n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[180:181], v[132:133], v[248:251] \n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[182:183], v[134:135], v[248:251] \n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[184:185], v[136:137], v[248:251] \n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[186:187], v[138:139], v[248:251] \n"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], %[s_res_d], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024 \n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[188:189], v[140:141], v[248:251] \n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[190:191], v[142:143], v[248:251] \n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[176:177], v[160:161], 0 \n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[178:179], v[162:163], v[252:255] \n"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], %[s_res_d], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048 \n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[180:181], v[164:165], v[252:255] \n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[182:183], v[166:167], v[252:255] \n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[184:185], v[168:169], v[252:255] \n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[186:187], v[170:171], v[252:255] \n"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], %[s_res_d], 0 offen offset:3072 \n"
" s_add_u32 %[s_res_d][0], %[s_tile_os_b_half], %[s_res_d][0] \n"
" s_addc_u32 %[s_res_d][1], 0, %[s_res_d][1] \n"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072 \n"
" s_add_u32 s12, %[s_tile_os_b_half], s12 \n"
" s_addc_u32 s13, 0, s13 \n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[188:189], v[172:173], v[252:255] \n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[190:191], v[174:175], v[252:255] \n"
" s_waitcnt vmcnt(41) \n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[192:193], v[144:145], v[224:227] \n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[194:195], v[146:147], v[224:227] \n"
" buffer_load_dwordx4 acc[64:67], %[v_os_b0], %[s_res_d], 0 offen \n"
" buffer_load_dwordx4 acc[64:67], %[v_os_b0], s[12:15], 0 offen \n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[196:197], v[148:149], v[224:227] \n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[198:199], v[150:151], v[224:227] \n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[200:201], v[152:153], v[224:227] \n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[202:203], v[154:155], v[224:227] \n"
" buffer_load_dwordx4 acc[68:71], %[v_os_b0], %[s_res_d], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[68:71], %[v_os_b0], s[12:15], 0 offen offset:1024 \n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[204:205], v[156:157], v[224:227] \n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[206:207], v[158:159], v[224:227] \n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[192:193], v[176:177], v[228:231] \n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[194:195], v[178:179], v[228:231] \n"
" buffer_load_dwordx4 acc[72:75], %[v_os_b0], %[s_res_d], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[72:75], %[v_os_b0], s[12:15], 0 offen offset:2048 \n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[196:197], v[180:181], v[228:231] \n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[198:199], v[182:183], v[228:231] \n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[200:201], v[184:185], v[228:231] \n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[202:203], v[186:187], v[228:231] \n"
" buffer_load_dwordx4 acc[76:79], %[v_os_b0], %[s_res_d], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[76:79], %[v_os_b0], s[12:15], 0 offen offset:3072 \n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[204:205], v[188:189], v[228:231] \n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[206:207], v[190:191], v[228:231] \n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[208:209], v[144:145], v[232:235] \n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[210:211], v[146:147], v[232:235] \n"
" buffer_load_dwordx4 acc[80:83], %[v_os_b1], %[s_res_d], 0 offen \n"
" buffer_load_dwordx4 acc[80:83], %[v_os_b1], s[12:15], 0 offen \n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[212:213], v[148:149], v[232:235] \n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[214:215], v[150:151], v[232:235] \n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[216:217], v[152:153], v[232:235] \n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[218:219], v[154:155], v[232:235] \n"
" buffer_load_dwordx4 acc[84:87], %[v_os_b1], %[s_res_d], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[84:87], %[v_os_b1], s[12:15], 0 offen offset:1024 \n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[220:221], v[156:157], v[232:235] \n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[222:223], v[158:159], v[232:235] \n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[208:209], v[176:177], v[236:239] \n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[210:211], v[178:179], v[236:239] \n"
" buffer_load_dwordx4 acc[88:91], %[v_os_b1], %[s_res_d], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[88:91], %[v_os_b1], s[12:15], 0 offen offset:2048 \n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[212:213], v[180:181], v[236:239] \n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[214:215], v[182:183], v[236:239] \n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[216:217], v[184:185], v[236:239] \n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[218:219], v[186:187], v[236:239] \n"
" buffer_load_dwordx4 acc[92:95], %[v_os_b1], %[s_res_d], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[92:95], %[v_os_b1], s[12:15], 0 offen offset:3072 \n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[220:221], v[188:189], v[236:239] \n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[222:223], v[190:191], v[236:239] \n"
" s_waitcnt vmcnt(40) \n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[224:225], v[144:145], v[240:243] \n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[226:227], v[146:147], v[240:243] \n"
" buffer_load_dwordx4 acc[96:99], %[v_os_b2], %[s_res_d], 0 offen \n"
" buffer_load_dwordx4 acc[96:99], %[v_os_b2], s[12:15], 0 offen \n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[228:229], v[148:149], v[240:243] \n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[230:231], v[150:151], v[240:243] \n"
" buffer_load_dword v12, %[v_os_dq], %[s_res_dq], 0 offen \n"
" buffer_load_dword v12, %[v_os_dq], s[16:19], 0 offen \n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[232:233], v[152:153], v[240:243] \n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[234:235], v[154:155], v[240:243] \n"
" buffer_load_dwordx4 acc[100:103], %[v_os_b2], %[s_res_d], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[100:103], %[v_os_b2], s[12:15], 0 offen offset:1024 \n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[236:237], v[156:157], v[240:243] \n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[238:239], v[158:159], v[240:243] \n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[224:225], v[176:177], v[244:247] \n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[226:227], v[178:179], v[244:247] \n"
" buffer_load_dwordx4 acc[104:107], %[v_os_b2], %[s_res_d], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[104:107], %[v_os_b2], s[12:15], 0 offen offset:2048 \n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[228:229], v[180:181], v[244:247] \n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[230:231], v[182:183], v[244:247] \n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[232:233], v[184:185], v[244:247] \n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[234:235], v[186:187], v[244:247] \n"
" buffer_load_dwordx4 acc[108:111], %[v_os_b2], %[s_res_d], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[108:111], %[v_os_b2], s[12:15], 0 offen offset:3072 \n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[236:237], v[188:189], v[244:247] \n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[238:239], v[190:191], v[244:247] \n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[240:241], v[144:145], v[248:251] \n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[242:243], v[146:147], v[248:251] \n"
" buffer_load_dwordx4 acc[112:115], %[v_os_b3], %[s_res_d], 0 offen \n"
" buffer_load_dwordx4 acc[112:115], %[v_os_b3], s[12:15], 0 offen \n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[244:245], v[148:149], v[248:251] \n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[246:247], v[150:151], v[248:251] \n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[248:249], v[152:153], v[248:251] \n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[250:251], v[154:155], v[248:251] \n"
" buffer_load_dwordx4 acc[116:119], %[v_os_b3], %[s_res_d], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[116:119], %[v_os_b3], s[12:15], 0 offen offset:1024 \n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[252:253], v[156:157], v[248:251] \n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[254:255], v[158:159], v[248:251] \n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[240:241], v[176:177], v[252:255] \n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[242:243], v[178:179], v[252:255] \n"
" buffer_load_dwordx4 acc[120:123], %[v_os_b3], %[s_res_d], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[120:123], %[v_os_b3], s[12:15], 0 offen offset:2048 \n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[244:245], v[180:181], v[252:255] \n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[246:247], v[182:183], v[252:255] \n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[248:249], v[184:185], v[252:255] \n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[250:251], v[186:187], v[252:255] \n"
" buffer_load_dwordx4 acc[124:127], %[v_os_b3], %[s_res_d], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[124:127], %[v_os_b3], s[12:15], 0 offen offset:3072 \n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[252:253], v[188:189], v[252:255] \n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[254:255], v[190:191], v[252:255] \n"
" s_add_u32 s60, 0x00000200, s80 \n"
......@@ -704,10 +704,10 @@
" s_cselect_b32 %[s_tile_os_b], %[s_tile_os_b], 0 \n"
" s_cselect_b32 %[s_tile_os_b_half], %[s_tile_os_b_half], 0 \n"
" s_cselect_b32 %[s_tile_os_dq], %[s_tile_os_dq], 0 \n"
" s_add_u32 %[s_res_d][0], %[s_tile_os_b], %[s_res_d][0] \n"
" s_addc_u32 %[s_res_d][1], 0, %[s_res_d][1] \n"
" s_add_u32 %[s_res_dq][0], %[s_tile_os_dq], %[s_res_dq][0] \n"
" s_addc_u32 %[s_res_dq][1], 0, %[s_res_dq][1] \n"
" s_add_u32 s12, %[s_tile_os_b], s12 \n"
" s_addc_u32 s13, 0, s13 \n"
" s_add_u32 s16, %[s_tile_os_dq], s16 \n"
" s_addc_u32 s17, 0, s17 \n"
" v_cvt_f32_i32 v224, v224 \n"
" v_cvt_f32_i32 v225, v225 \n"
" v_cvt_f32_i32 v226, v226 \n"
......@@ -988,40 +988,40 @@
" global_atomic_pk_add_bf16 %[v_os_o1], v67, [%[s_res_o0],%[s_res_o1]] inst_offset:256 \n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[24:25] \n"
" global_atomic_pk_add_bf16 %[v_os_o3], v68, [%[s_res_o0],%[s_res_o1]] \n"
" global_atomic_pk_add_bf16 %[v_os_o2], v68, [%[s_res_o0],%[s_res_o1]] \n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[24:25] \n"
" global_atomic_pk_add_bf16 %[v_os_o3], v69, [%[s_res_o0],%[s_res_o1]] inst_offset:256 \n"
" global_atomic_pk_add_bf16 %[v_os_o2], v69, [%[s_res_o0],%[s_res_o1]] inst_offset:256 \n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[26:27] \n"
" global_atomic_pk_add_bf16 %[v_os_o4], v70, [%[s_res_o0],%[s_res_o1]] \n"
" global_atomic_pk_add_bf16 %[v_os_o3], v70, [%[s_res_o0],%[s_res_o1]] \n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[26:27] \n"
" global_atomic_pk_add_bf16 %[v_os_o4], v71, [%[s_res_o0],%[s_res_o1]] inst_offset:256 \n"
" global_atomic_pk_add_bf16 %[v_os_o3], v71, [%[s_res_o0],%[s_res_o1]] inst_offset:256 \n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[28:29] \n"
" global_atomic_pk_add_bf16 %[v_os_o5], v72, [%[s_res_o0],%[s_res_o1]] \n"
" global_atomic_pk_add_bf16 %[v_os_o4], v72, [%[s_res_o0],%[s_res_o1]] \n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[28:29] \n"
" global_atomic_pk_add_bf16 %[v_os_o5], v73, [%[s_res_o0],%[s_res_o1]] inst_offset:256 \n"
" global_atomic_pk_add_bf16 %[v_os_o4], v73, [%[s_res_o0],%[s_res_o1]] inst_offset:256 \n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[30:31] \n"
" global_atomic_pk_add_bf16 %[v_os_o6], v74, [%[s_res_o0],%[s_res_o1]] \n"
" global_atomic_pk_add_bf16 %[v_os_o5], v74, [%[s_res_o0],%[s_res_o1]] \n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[30:31] \n"
" global_atomic_pk_add_bf16 %[v_os_o6], v75, [%[s_res_o0],%[s_res_o1]] inst_offset:256 \n"
" global_atomic_pk_add_bf16 %[v_os_o5], v75, [%[s_res_o0],%[s_res_o1]] inst_offset:256 \n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[32:33] \n"
" global_atomic_pk_add_bf16 %[v_os_o7], v76, [%[s_res_o0],%[s_res_o1]] \n"
" global_atomic_pk_add_bf16 %[v_os_o6], v76, [%[s_res_o0],%[s_res_o1]] \n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[32:33] \n"
" global_atomic_pk_add_bf16 %[v_os_o7], v77, [%[s_res_o0],%[s_res_o1]] inst_offset:256 \n"
" global_atomic_pk_add_bf16 %[v_os_o6], v77, [%[s_res_o0],%[s_res_o1]] inst_offset:256 \n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[34:35] \n"
" global_atomic_pk_add_bf16 %[v_os_o8], v78, [%[s_res_o0],%[s_res_o1]] \n"
" global_atomic_pk_add_bf16 %[v_os_o7], v78, [%[s_res_o0],%[s_res_o1]] \n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[34:35] \n"
" global_atomic_pk_add_bf16 %[v_os_o8], v79, [%[s_res_o0],%[s_res_o1]] inst_offset:256 \n"
" global_atomic_pk_add_bf16 %[v_os_o7], v79, [%[s_res_o0],%[s_res_o1]] inst_offset:256 \n"
" s_mov_b64 exec, s[36:37] \n"
" s_add_u32 %[s_res_o0], s59, %[s_res_o0] \n"
" s_addc_u32 %[s_res_o1], 0, %[s_res_o1] \n"
......@@ -1038,3 +1038,5 @@
......@@ -19,6 +19,20 @@
" v_mul_f32 " a2 ", " gq ", " a2 " row_newbcast: " brd2 " \n" \
" v_mul_f32 " a3 ", " gq ", " a3 " row_newbcast:" brd3 " \n"
"s_mov_b32 s20, %[s_res_a0] \n"
"s_mov_b32 s21, %[s_res_a1] \n"
"s_mov_b32 s22, %[s_res_a2] \n"
"s_mov_b32 s23, %[s_res_a3] \n"
"s_mov_b32 s24, %[s_res_b0] \n"
"s_mov_b32 s25, %[s_res_b1] \n"
"s_mov_b32 s26, %[s_res_b2] \n"
"s_mov_b32 s27, %[s_res_b3] \n"
"s_mov_b32 s16, %[s_res_dq0] \n"
"s_mov_b32 s17, %[s_res_dq1] \n"
"s_mov_b32 s12, %[s_res_d0] \n"
"s_mov_b32 s13, %[s_res_d1] \n"
"s_mov_b32 s14, %[s_res_d2] \n"
"s_mov_b32 s15, %[s_res_d3] \n"
";---------------------------------------------- \n"
" s_mov_b32 s57, 0x00000100 \n"
" s_mov_b32 s58, 0x00001000 \n"
......@@ -56,76 +70,76 @@
";---------------------------------------------- \n"
"; -- prefetch A0\n"
"s_add_u32 m0, 0, %[s_m0_init] \n"
"buffer_load_dword %[v_os_a0], %[s_res_a], 0 offen lds \n"
"buffer_load_dword %[v_os_a0], s[20:23], 0 offen lds \n"
"s_add_u32 m0, %[s_size_per_issue], m0 \n"
"buffer_load_dword %[v_os_a1], %[s_res_a], 0 offen lds \n"
"buffer_load_dword %[v_os_a1], s[20:23], 0 offen lds \n"
"s_add_u32 m0, %[s_size_per_issue], m0 \n"
"buffer_load_dword %[v_os_a2], %[s_res_a], 0 offen lds \n"
"buffer_load_dword %[v_os_a2], s[20:23], 0 offen lds \n"
"s_add_u32 m0, %[s_size_per_issue], m0 \n"
"buffer_load_dword %[v_os_a3], %[s_res_a], 0 offen lds \n"
"buffer_load_dword %[v_os_a3], s[20:23], 0 offen lds \n"
"s_add_u32 m0, %[s_size_per_issue], m0 \n"
"buffer_load_dword %[v_os_a4], %[s_res_a], 0 offen lds \n"
"buffer_load_dword %[v_os_a4], s[20:23], 0 offen lds \n"
"s_add_u32 m0, %[s_size_per_issue], m0 \n"
"buffer_load_dword %[v_os_a5], %[s_res_a], 0 offen lds \n"
"buffer_load_dword %[v_os_a5], s[20:23], 0 offen lds \n"
"s_add_u32 m0, %[s_size_per_issue], m0 \n"
"buffer_load_dword %[v_os_a6], %[s_res_a], 0 offen lds \n"
"buffer_load_dword %[v_os_a6], s[20:23], 0 offen lds \n"
"s_add_u32 m0, %[s_size_per_issue], m0 \n"
"buffer_load_dword %[v_os_a7], %[s_res_a], 0 offen lds \n"
"buffer_load_dword %[v_os_a7], s[20:23], 0 offen lds \n"
"s_add_u32 m0, %[smem_sz], %[s_m0_init] \n"
" s_add_u32 s20, s57, s20 \n"
" s_addc_u32 s21, 0, s21 \n"
"; -- prefetch A1\n"
"buffer_load_dword %[v_os_a0], %[s_res_a], 0 offen lds \n"
"buffer_load_dword %[v_os_a0], s[20:23], 0 offen lds \n"
"s_add_u32 m0, %[s_size_per_issue], m0 \n"
"buffer_load_dword %[v_os_a1], %[s_res_a], 0 offen lds \n"
"buffer_load_dword %[v_os_a1], s[20:23], 0 offen lds \n"
"s_add_u32 m0, %[s_size_per_issue], m0 \n"
"buffer_load_dword %[v_os_a2], %[s_res_a], 0 offen lds \n"
"buffer_load_dword %[v_os_a2], s[20:23], 0 offen lds \n"
"s_add_u32 m0, %[s_size_per_issue], m0 \n"
"buffer_load_dword %[v_os_a3], %[s_res_a], 0 offen lds \n"
"buffer_load_dword %[v_os_a3], s[20:23], 0 offen lds \n"
"s_add_u32 m0, %[s_size_per_issue], m0 \n"
"buffer_load_dword %[v_os_a4], %[s_res_a], 0 offen lds \n"
"buffer_load_dword %[v_os_a4], s[20:23], 0 offen lds \n"
"s_add_u32 m0, %[s_size_per_issue], m0 \n"
"buffer_load_dword %[v_os_a5], %[s_res_a], 0 offen lds \n"
"buffer_load_dword %[v_os_a5], s[20:23], 0 offen lds \n"
"s_add_u32 m0, %[s_size_per_issue], m0 \n"
"buffer_load_dword %[v_os_a6], %[s_res_a], 0 offen lds \n"
"buffer_load_dword %[v_os_a6], s[20:23], 0 offen lds \n"
"s_add_u32 m0, %[s_size_per_issue], m0 \n"
"buffer_load_dword %[v_os_a7], %[s_res_a], 0 offen lds \n"
"buffer_load_dword %[v_os_a7], s[20:23], 0 offen lds \n"
"s_add_u32 m0, 0, %[s_m0_init] \n"
" s_add_u32 s20, s57, s20 \n"
" s_addc_u32 s21, 0, s21 \n"
"; -- prefetch B0\n"
"buffer_load_dwordx4 acc[0:3], %[v_os_b0], %[s_res_b], 0 offen \n"
"buffer_load_dwordx4 acc[4:7], %[v_os_b0], %[s_res_b], 0 offen offset:1024 \n"
"buffer_load_dwordx4 acc[8:11], %[v_os_b0], %[s_res_b], 0 offen offset:2048 \n"
"buffer_load_dwordx4 acc[12:15], %[v_os_b0], %[s_res_b], 0 offen offset:3072 \n"
"buffer_load_dwordx4 acc[16:19], %[v_os_b1], %[s_res_b], 0 offen \n"
"buffer_load_dwordx4 acc[20:23], %[v_os_b1], %[s_res_b], 0 offen offset:1024 \n"
"buffer_load_dwordx4 acc[24:27], %[v_os_b1], %[s_res_b], 0 offen offset:2048 \n"
"buffer_load_dwordx4 acc[28:31], %[v_os_b1], %[s_res_b], 0 offen offset:3072 \n"
"buffer_load_dwordx4 acc[32:35], %[v_os_b2], %[s_res_b], 0 offen \n"
"buffer_load_dwordx4 acc[36:39], %[v_os_b2], %[s_res_b], 0 offen offset:1024 \n"
"buffer_load_dwordx4 acc[40:43], %[v_os_b2], %[s_res_b], 0 offen offset:2048 \n"
"buffer_load_dwordx4 acc[44:47], %[v_os_b2], %[s_res_b], 0 offen offset:3072 \n"
"buffer_load_dwordx4 acc[48:51], %[v_os_b3], %[s_res_b], 0 offen \n"
"buffer_load_dwordx4 acc[52:55], %[v_os_b3], %[s_res_b], 0 offen offset:1024 \n"
"buffer_load_dwordx4 acc[56:59], %[v_os_b3], %[s_res_b], 0 offen offset:2048 \n"
"buffer_load_dwordx4 acc[60:63], %[v_os_b3], %[s_res_b], 0 offen offset:3072 \n"
"buffer_load_dwordx4 acc[64:67], %[v_os_b4], %[s_res_b], 0 offen \n"
"buffer_load_dwordx4 acc[68:71], %[v_os_b4], %[s_res_b], 0 offen offset:1024 \n"
"buffer_load_dwordx4 acc[72:75], %[v_os_b4], %[s_res_b], 0 offen offset:2048 \n"
"buffer_load_dwordx4 acc[76:79], %[v_os_b4], %[s_res_b], 0 offen offset:3072 \n"
"buffer_load_dwordx4 acc[80:83], %[v_os_b5], %[s_res_b], 0 offen \n"
"buffer_load_dwordx4 acc[84:87], %[v_os_b5], %[s_res_b], 0 offen offset:1024 \n"
"buffer_load_dwordx4 acc[88:91], %[v_os_b5], %[s_res_b], 0 offen offset:2048 \n"
"buffer_load_dwordx4 acc[92:95], %[v_os_b5], %[s_res_b], 0 offen offset:3072 \n"
"buffer_load_dwordx4 acc[96:99], %[v_os_b6], %[s_res_b], 0 offen \n"
"buffer_load_dwordx4 acc[100:103], %[v_os_b6], %[s_res_b], 0 offen offset:1024 \n"
"buffer_load_dwordx4 acc[104:107], %[v_os_b6], %[s_res_b], 0 offen offset:2048 \n"
"buffer_load_dwordx4 acc[108:111], %[v_os_b6], %[s_res_b], 0 offen offset:3072 \n"
"buffer_load_dwordx4 acc[112:115], %[v_os_b7], %[s_res_b], 0 offen \n"
"buffer_load_dwordx4 acc[116:119], %[v_os_b7], %[s_res_b], 0 offen offset:1024 \n"
"buffer_load_dwordx4 acc[120:123], %[v_os_b7], %[s_res_b], 0 offen offset:2048 \n"
"buffer_load_dwordx4 acc[124:127], %[v_os_b7], %[s_res_b], 0 offen offset:3072 \n"
"buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[24:27], 0 offen \n"
"buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[24:27], 0 offen offset:1024 \n"
"buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[24:27], 0 offen offset:2048 \n"
"buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[24:27], 0 offen offset:3072 \n"
"buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[24:27], 0 offen \n"
"buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[24:27], 0 offen offset:1024 \n"
"buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[24:27], 0 offen offset:2048 \n"
"buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[24:27], 0 offen offset:3072 \n"
"buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[24:27], 0 offen \n"
"buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[24:27], 0 offen offset:1024 \n"
"buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[24:27], 0 offen offset:2048 \n"
"buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[24:27], 0 offen offset:3072 \n"
"buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[24:27], 0 offen \n"
"buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[24:27], 0 offen offset:1024 \n"
"buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[24:27], 0 offen offset:2048 \n"
"buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[24:27], 0 offen offset:3072 \n"
"buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[24:27], 0 offen \n"
"buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[24:27], 0 offen offset:1024 \n"
"buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[24:27], 0 offen offset:2048 \n"
"buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[24:27], 0 offen offset:3072 \n"
"buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[24:27], 0 offen \n"
"buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[24:27], 0 offen offset:1024 \n"
"buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[24:27], 0 offen offset:2048 \n"
"buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[24:27], 0 offen offset:3072 \n"
"buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[24:27], 0 offen \n"
"buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[24:27], 0 offen offset:1024 \n"
"buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[24:27], 0 offen offset:2048 \n"
"buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[24:27], 0 offen offset:3072 \n"
"buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[24:27], 0 offen \n"
"buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[24:27], 0 offen offset:1024 \n"
"buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[24:27], 0 offen offset:2048 \n"
"buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[24:27], 0 offen offset:3072 \n"
"s_add_u32 s24, s58, s24 \n"
"s_addc_u32 s25, 0, s25 \n"
......@@ -146,189 +160,189 @@
" s_barrier \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[0:1], v[192:193], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[2:3], v[194:195], [%[c0], %[c1], %[c2], %[c3]] \n"
" buffer_load_dwordx4 acc[128:131], %[v_os_b0], %[s_res_b], 0 offen \n"
" buffer_load_dwordx4 acc[128:131], %[v_os_b0], s[24:27], 0 offen \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[4:5], v[196:197], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[6:7], v[198:199], [%[c0], %[c1], %[c2], %[c3]] \n"
" buffer_load_dword %[v_os_a0] %[s_res_a], 0 offen lds \n"
" buffer_load_dword %[v_os_a0] s[20:23], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[8:9], v[200:201], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[10:11], v[202:203], [%[c0], %[c1], %[c2], %[c3]] \n"
" buffer_load_dwordx4 acc[132:135], %[v_os_b0], %[s_res_b], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[132:135], %[v_os_b0], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[12:13], v[204:205], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[14:15], v[206:207], [%[c0], %[c1], %[c2], %[c3]] \n"
" buffer_load_dword %[v_os_a1] %[s_res_a], 0 offen lds \n"
" buffer_load_dword %[v_os_a1] s[20:23], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[0:1], v[208:209], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[2:3], v[210:211], [%[c4], %[c5], %[c6], %[c7]] \n"
" buffer_load_dwordx4 acc[136:139], %[v_os_b0], %[s_res_b], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[136:139], %[v_os_b0], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[4:5], v[212:213], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[6:7], v[214:215], [%[c4], %[c5], %[c6], %[c7]] \n"
" buffer_load_dword %[v_os_a2] %[s_res_a], 0 offen lds \n"
" buffer_load_dword %[v_os_a2] s[20:23], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[8:9], v[216:217], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[10:11], v[218:219], [%[c4], %[c5], %[c6], %[c7]] \n"
" buffer_load_dwordx4 acc[140:143], %[v_os_b0], %[s_res_b], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[140:143], %[v_os_b0], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[12:13], v[220:221], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[14:15], v[222:223], [%[c4], %[c5], %[c6], %[c7]] \n"
" buffer_load_dword %[v_os_a3] %[s_res_a], 0 offen lds \n"
" buffer_load_dword %[v_os_a3] s[20:23], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[16:17], v[192:193], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[18:19], v[194:195], [%[c8], %[c9], %[c10], %[c11]] \n"
" buffer_load_dwordx4 acc[144:147], %[v_os_b1], %[s_res_b], 0 offen \n"
" buffer_load_dwordx4 acc[144:147], %[v_os_b1], s[24:27], 0 offen \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[20:21], v[196:197], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[22:23], v[198:199], [%[c8], %[c9], %[c10], %[c11]] \n"
" buffer_load_dword %[v_os_a4] %[s_res_a], 0 offen lds \n"
" buffer_load_dword %[v_os_a4] s[20:23], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[24:25], v[200:201], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[26:27], v[202:203], [%[c8], %[c9], %[c10], %[c11]] \n"
" buffer_load_dwordx4 acc[148:151], %[v_os_b1], %[s_res_b], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[148:151], %[v_os_b1], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[28:29], v[204:205], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[30:31], v[206:207], [%[c8], %[c9], %[c10], %[c11]] \n"
" buffer_load_dword %[v_os_a5] %[s_res_a], 0 offen lds \n"
" buffer_load_dword %[v_os_a5] s[20:23], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[16:17], v[208:209], [%[c12], %[c13], %[c14], %[c15]] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[18:19], v[210:211], [%[c12], %[c13], %[c14], %[c15]] \n"
" buffer_load_dwordx4 acc[152:155], %[v_os_b1], %[s_res_b], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[152:155], %[v_os_b1], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[20:21], v[212:213], [%[c12], %[c13], %[c14], %[c15]] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[22:23], v[214:215], [%[c12], %[c13], %[c14], %[c15]] \n"
" buffer_load_dword %[v_os_a6] %[s_res_a], 0 offen lds \n"
" buffer_load_dword %[v_os_a6] s[20:23], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[24:25], v[216:217], [%[c12], %[c13], %[c14], %[c15]] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[26:27], v[218:219], [%[c12], %[c13], %[c14], %[c15]] \n"
" buffer_load_dwordx4 acc[156:159], %[v_os_b1], %[s_res_b], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[156:159], %[v_os_b1], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[28:29], v[220:221], [%[c12], %[c13], %[c14], %[c15]] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[30:31], v[222:223], [%[c12], %[c13], %[c14], %[c15]] \n"
" buffer_load_dword %[v_os_a7] %[s_res_a], 0 offen lds \n"
" buffer_load_dword %[v_os_a7] s[20:23], 0 offen lds \n"
" s_add_u32 m0, %[smem_sz], %[s_m0_init] \n"
" s_waitcnt vmcnt(32) \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[32:33], v[192:193], [%[c16], %[c17], %[c18], %[c19]] \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[34:35], v[194:195], [%[c16], %[c17], %[c18], %[c19]] \n"
" buffer_load_dwordx4 acc[160:163], %[v_os_b2], %[s_res_b], 0 offen \n"
" buffer_load_dwordx4 acc[160:163], %[v_os_b2], s[24:27], 0 offen \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[36:37], v[196:197], [%[c16], %[c17], %[c18], %[c19]] \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[38:39], v[198:199], [%[c16], %[c17], %[c18], %[c19]] \n"
" ds_read_b128 v[224:227], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_0] \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[40:41], v[200:201], [%[c16], %[c17], %[c18], %[c19]] \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[42:43], v[202:203], [%[c16], %[c17], %[c18], %[c19]] \n"
" buffer_load_dwordx4 acc[164:167], %[v_os_b2], %[s_res_b], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[164:167], %[v_os_b2], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[44:45], v[204:205], [%[c16], %[c17], %[c18], %[c19]] \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[46:47], v[206:207], [%[c16], %[c17], %[c18], %[c19]] \n"
" ds_read_b128 v[228:231], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_1] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[32:33], v[208:209], [%[c20], %[c21], %[c22], %[c23]] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[34:35], v[210:211], [%[c20], %[c21], %[c22], %[c23]] \n"
" buffer_load_dwordx4 acc[168:171], %[v_os_b2], %[s_res_b], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[168:171], %[v_os_b2], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[36:37], v[212:213], [%[c20], %[c21], %[c22], %[c23]] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[38:39], v[214:215], [%[c20], %[c21], %[c22], %[c23]] \n"
" ds_read_b128 v[232:235], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_2] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[40:41], v[216:217], [%[c20], %[c21], %[c22], %[c23]] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[42:43], v[218:219], [%[c20], %[c21], %[c22], %[c23]] \n"
" buffer_load_dwordx4 acc[172:175], %[v_os_b2], %[s_res_b], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[172:175], %[v_os_b2], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[44:45], v[220:221], [%[c20], %[c21], %[c22], %[c23]] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[46:47], v[222:223], [%[c20], %[c21], %[c22], %[c23]] \n"
" ds_read_b128 v[236:239], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_3] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[48:49], v[192:193], [%[c24], %[c25], %[c26], %[c27]] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[50:51], v[194:195], [%[c24], %[c25], %[c26], %[c27]] \n"
" buffer_load_dwordx4 acc[176:179], %[v_os_b3], %[s_res_b], 0 offen \n"
" buffer_load_dwordx4 acc[176:179], %[v_os_b3], s[24:27], 0 offen \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[52:53], v[196:197], [%[c24], %[c25], %[c26], %[c27]] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[54:55], v[198:199], [%[c24], %[c25], %[c26], %[c27]] \n"
" ds_read_b128 v[240:243], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_4] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[56:57], v[200:201], [%[c24], %[c25], %[c26], %[c27]] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[58:59], v[202:203], [%[c24], %[c25], %[c26], %[c27]] \n"
" buffer_load_dwordx4 acc[180:183], %[v_os_b3], %[s_res_b], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[180:183], %[v_os_b3], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[60:61], v[204:205], [%[c24], %[c25], %[c26], %[c27]] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[62:63], v[206:207], [%[c24], %[c25], %[c26], %[c27]] \n"
" ds_read_b128 v[244:247], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_5] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[48:49], v[208:209], [%[c28], %[c29], %[c30], %[c31]] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[50:51], v[210:211], [%[c28], %[c29], %[c30], %[c31]] \n"
" buffer_load_dwordx4 acc[184:187], %[v_os_b3], %[s_res_b], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[184:187], %[v_os_b3], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[52:53], v[212:213], [%[c28], %[c29], %[c30], %[c31]] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[54:55], v[214:215], [%[c28], %[c29], %[c30], %[c31]] \n"
" ds_read_b128 v[248:251], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_6] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[56:57], v[216:217], [%[c28], %[c29], %[c30], %[c31]] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[58:59], v[218:219], [%[c28], %[c29], %[c30], %[c31]] \n"
" buffer_load_dwordx4 acc[188:191], %[v_os_b3], %[s_res_b], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[188:191], %[v_os_b3], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[60:61], v[220:221], [%[c28], %[c29], %[c30], %[c31]] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[62:63], v[222:223], [%[c28], %[c29], %[c30], %[c31]] \n"
" ds_read_b128 v[252:255], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_7] \n"
" s_waitcnt vmcnt(32) \n"
_UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[64:65], v[192:193], [%[c32], %[c33], %[c34], %[c35]] \n"
_UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[66:67], v[194:195], [%[c32], %[c33], %[c34], %[c35]] \n"
" buffer_load_dwordx4 acc[192:195], %[v_os_b4], %[s_res_b], 0 offen \n"
" buffer_load_dwordx4 acc[192:195], %[v_os_b4], s[24:27], 0 offen \n"
_UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[68:69], v[196:197], [%[c32], %[c33], %[c34], %[c35]] \n"
_UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[70:71], v[198:199], [%[c32], %[c33], %[c34], %[c35]] \n"
_UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[72:73], v[200:201], [%[c32], %[c33], %[c34], %[c35]] \n"
_UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[74:75], v[202:203], [%[c32], %[c33], %[c34], %[c35]] \n"
" buffer_load_dwordx4 acc[196:199], %[v_os_b4], %[s_res_b], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[196:199], %[v_os_b4], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[76:77], v[204:205], [%[c32], %[c33], %[c34], %[c35]] \n"
_UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[78:79], v[206:207], [%[c32], %[c33], %[c34], %[c35]] \n"
_UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[64:65], v[208:209], [%[c36], %[c37], %[c38], %[c39]] \n"
_UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[66:67], v[210:211], [%[c36], %[c37], %[c38], %[c39]] \n"
" buffer_load_dwordx4 acc[200:203], %[v_os_b4], %[s_res_b], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[200:203], %[v_os_b4], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[68:69], v[212:213], [%[c36], %[c37], %[c38], %[c39]] \n"
_UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[70:71], v[214:215], [%[c36], %[c37], %[c38], %[c39]] \n"
_UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[72:73], v[216:217], [%[c36], %[c37], %[c38], %[c39]] \n"
_UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[74:75], v[218:219], [%[c36], %[c37], %[c38], %[c39]] \n"
" buffer_load_dwordx4 acc[204:207], %[v_os_b4], %[s_res_b], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[204:207], %[v_os_b4], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[76:77], v[220:221], [%[c36], %[c37], %[c38], %[c39]] \n"
_UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[78:79], v[222:223], [%[c36], %[c37], %[c38], %[c39]] \n"
_UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[80:81], v[192:193], [%[c40], %[c41], %[c42], %[c43]] \n"
_UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[82:83], v[194:195], [%[c40], %[c41], %[c42], %[c43]] \n"
" buffer_load_dwordx4 acc[208:211], %[v_os_b5], %[s_res_b], 0 offen \n"
" buffer_load_dwordx4 acc[208:211], %[v_os_b5], s[24:27], 0 offen \n"
_UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[84:85], v[196:197], [%[c40], %[c41], %[c42], %[c43]] \n"
_UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[86:87], v[198:199], [%[c40], %[c41], %[c42], %[c43]] \n"
_UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[88:89], v[200:201], [%[c40], %[c41], %[c42], %[c43]] \n"
_UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[90:91], v[202:203], [%[c40], %[c41], %[c42], %[c43]] \n"
" buffer_load_dwordx4 acc[212:215], %[v_os_b5], %[s_res_b], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[212:215], %[v_os_b5], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[92:93], v[204:205], [%[c40], %[c41], %[c42], %[c43]] \n"
_UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[94:95], v[206:207], [%[c40], %[c41], %[c42], %[c43]] \n"
_UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[80:81], v[208:209], [%[c44], %[c45], %[c46], %[c47]] \n"
_UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[82:83], v[210:211], [%[c44], %[c45], %[c46], %[c47]] \n"
" buffer_load_dwordx4 acc[216:219], %[v_os_b5], %[s_res_b], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[216:219], %[v_os_b5], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[84:85], v[212:213], [%[c44], %[c45], %[c46], %[c47]] \n"
_UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[86:87], v[214:215], [%[c44], %[c45], %[c46], %[c47]] \n"
_UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[88:89], v[216:217], [%[c44], %[c45], %[c46], %[c47]] \n"
_UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[90:91], v[218:219], [%[c44], %[c45], %[c46], %[c47]] \n"
" buffer_load_dwordx4 acc[220:223], %[v_os_b5], %[s_res_b], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[220:223], %[v_os_b5], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[92:93], v[220:221], [%[c44], %[c45], %[c46], %[c47]] \n"
_UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[94:95], v[222:223], [%[c44], %[c45], %[c46], %[c47]] \n"
" s_waitcnt vmcnt(32) \n"
_UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[96:97], v[192:193], [%[c48], %[c49], %[c50], %[c51]] \n"
_UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[98:99], v[194:195], [%[c48], %[c49], %[c50], %[c51]] \n"
" buffer_load_dwordx4 acc[224:227], %[v_os_b6], %[s_res_b], 0 offen \n"
" buffer_load_dwordx4 acc[224:227], %[v_os_b6], s[24:27], 0 offen \n"
_UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[100:101], v[196:197], [%[c48], %[c49], %[c50], %[c51]] \n"
_UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[102:103], v[198:199], [%[c48], %[c49], %[c50], %[c51]] \n"
_UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[104:105], v[200:201], [%[c48], %[c49], %[c50], %[c51]] \n"
_UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[106:107], v[202:203], [%[c48], %[c49], %[c50], %[c51]] \n"
" buffer_load_dwordx4 acc[228:231], %[v_os_b6], %[s_res_b], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[228:231], %[v_os_b6], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[108:109], v[204:205], [%[c48], %[c49], %[c50], %[c51]] \n"
_UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[110:111], v[206:207], [%[c48], %[c49], %[c50], %[c51]] \n"
_UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[96:97], v[208:209], [%[c52], %[c53], %[c54], %[c55]] \n"
_UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[98:99], v[210:211], [%[c52], %[c53], %[c54], %[c55]] \n"
" buffer_load_dwordx4 acc[232:235], %[v_os_b6], %[s_res_b], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[232:235], %[v_os_b6], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[100:101], v[212:213], [%[c52], %[c53], %[c54], %[c55]] \n"
_UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[102:103], v[214:215], [%[c52], %[c53], %[c54], %[c55]] \n"
_UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[104:105], v[216:217], [%[c52], %[c53], %[c54], %[c55]] \n"
_UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[106:107], v[218:219], [%[c52], %[c53], %[c54], %[c55]] \n"
" buffer_load_dwordx4 acc[236:239], %[v_os_b6], %[s_res_b], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[236:239], %[v_os_b6], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[108:109], v[220:221], [%[c52], %[c53], %[c54], %[c55]] \n"
_UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[110:111], v[222:223], [%[c52], %[c53], %[c54], %[c55]] \n"
_UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[112:113], v[192:193], [%[c56], %[c57], %[c58], %[c59]] \n"
_UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[114:115], v[194:195], [%[c56], %[c57], %[c58], %[c59]] \n"
" buffer_load_dwordx4 acc[240:243], %[v_os_b7], %[s_res_b], 0 offen \n"
" buffer_load_dwordx4 acc[240:243], %[v_os_b7], s[24:27], 0 offen \n"
_UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[116:117], v[196:197], [%[c56], %[c57], %[c58], %[c59]] \n"
_UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[118:119], v[198:199], [%[c56], %[c57], %[c58], %[c59]] \n"
_UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[120:121], v[200:201], [%[c56], %[c57], %[c58], %[c59]] \n"
_UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[122:123], v[202:203], [%[c56], %[c57], %[c58], %[c59]] \n"
" buffer_load_dwordx4 acc[244:247], %[v_os_b7], %[s_res_b], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[244:247], %[v_os_b7], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[124:125], v[204:205], [%[c56], %[c57], %[c58], %[c59]] \n"
_UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[126:127], v[206:207], [%[c56], %[c57], %[c58], %[c59]] \n"
_UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[112:113], v[208:209], [%[c60], %[c61], %[c62], %[c63]] \n"
_UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[114:115], v[210:211], [%[c60], %[c61], %[c62], %[c63]] \n"
" buffer_load_dwordx4 acc[248:251], %[v_os_b7], %[s_res_b], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[248:251], %[v_os_b7], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[116:117], v[212:213], [%[c60], %[c61], %[c62], %[c63]] \n"
_UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[118:119], v[214:215], [%[c60], %[c61], %[c62], %[c63]] \n"
_UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[120:121], v[216:217], [%[c60], %[c61], %[c62], %[c63]] \n"
_UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[122:123], v[218:219], [%[c60], %[c61], %[c62], %[c63]] \n"
" buffer_load_dwordx4 acc[252:255], %[v_os_b7], %[s_res_b], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[252:255], %[v_os_b7], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[124:125], v[220:221], [%[c60], %[c61], %[c62], %[c63]] \n"
_UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[126:127], v[222:223], [%[c60], %[c61], %[c62], %[c63]] \n"
" s_add_u32 s60, 0x00000300, s80 \n"
......@@ -348,189 +362,189 @@
" s_barrier \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[128:129], v[224:225], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[130:131], v[226:227], [%[c0], %[c1], %[c2], %[c3]] \n"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], %[s_res_b], 0 offen \n"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[24:27], 0 offen \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[132:133], v[228:229], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[134:135], v[230:231], [%[c0], %[c1], %[c2], %[c3]] \n"
" buffer_load_dword %[v_os_a0] %[s_res_a], 0 offen lds \n"
" buffer_load_dword %[v_os_a0] s[20:23], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[136:137], v[232:233], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[138:139], v[234:235], [%[c0], %[c1], %[c2], %[c3]] \n"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], %[s_res_b], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[140:141], v[236:237], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[142:143], v[238:239], [%[c0], %[c1], %[c2], %[c3]] \n"
" buffer_load_dword %[v_os_a1] %[s_res_a], 0 offen lds \n"
" buffer_load_dword %[v_os_a1] s[20:23], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[128:129], v[240:241], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[130:131], v[242:243], [%[c4], %[c5], %[c6], %[c7]] \n"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], %[s_res_b], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[132:133], v[244:245], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[134:135], v[246:247], [%[c4], %[c5], %[c6], %[c7]] \n"
" buffer_load_dword %[v_os_a2] %[s_res_a], 0 offen lds \n"
" buffer_load_dword %[v_os_a2] s[20:23], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[136:137], v[248:249], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[138:139], v[250:251], [%[c4], %[c5], %[c6], %[c7]] \n"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], %[s_res_b], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[140:141], v[252:253], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[142:143], v[254:255], [%[c4], %[c5], %[c6], %[c7]] \n"
" buffer_load_dword %[v_os_a3] %[s_res_a], 0 offen lds \n"
" buffer_load_dword %[v_os_a3] s[20:23], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[144:145], v[224:225], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[146:147], v[226:227], [%[c8], %[c9], %[c10], %[c11]] \n"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], %[s_res_b], 0 offen \n"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[24:27], 0 offen \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[148:149], v[228:229], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[150:151], v[230:231], [%[c8], %[c9], %[c10], %[c11]] \n"
" buffer_load_dword %[v_os_a4] %[s_res_a], 0 offen lds \n"
" buffer_load_dword %[v_os_a4] s[20:23], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[152:153], v[232:233], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[154:155], v[234:235], [%[c8], %[c9], %[c10], %[c11]] \n"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], %[s_res_b], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[156:157], v[236:237], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[158:159], v[238:239], [%[c8], %[c9], %[c10], %[c11]] \n"
" buffer_load_dword %[v_os_a5] %[s_res_a], 0 offen lds \n"
" buffer_load_dword %[v_os_a5] s[20:23], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[144:145], v[240:241], [%[c12], %[c13], %[c14], %[c15]] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[146:147], v[242:243], [%[c12], %[c13], %[c14], %[c15]] \n"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], %[s_res_b], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[148:149], v[244:245], [%[c12], %[c13], %[c14], %[c15]] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[150:151], v[246:247], [%[c12], %[c13], %[c14], %[c15]] \n"
" buffer_load_dword %[v_os_a6] %[s_res_a], 0 offen lds \n"
" buffer_load_dword %[v_os_a6] s[20:23], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[152:153], v[248:249], [%[c12], %[c13], %[c14], %[c15]] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[154:155], v[250:251], [%[c12], %[c13], %[c14], %[c15]] \n"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], %[s_res_b], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[156:157], v[252:253], [%[c12], %[c13], %[c14], %[c15]] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[158:159], v[254:255], [%[c12], %[c13], %[c14], %[c15]] \n"
" buffer_load_dword %[v_os_a7] %[s_res_a], 0 offen lds \n"
" buffer_load_dword %[v_os_a7] s[20:23], 0 offen lds \n"
" s_add_u32 m0, 0, %[s_m0_init] \n"
" s_waitcnt vmcnt(32) \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[160:161], v[224:225], [%[c16], %[c17], %[c18], %[c19]] \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[162:163], v[226:227], [%[c16], %[c17], %[c18], %[c19]] \n"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], %[s_res_b], 0 offen \n"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[24:27], 0 offen \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[164:165], v[228:229], [%[c16], %[c17], %[c18], %[c19]] \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[166:167], v[230:231], [%[c16], %[c17], %[c18], %[c19]] \n"
" ds_read_b128 v[192:195], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_0] \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[168:169], v[232:233], [%[c16], %[c17], %[c18], %[c19]] \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[170:171], v[234:235], [%[c16], %[c17], %[c18], %[c19]] \n"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], %[s_res_b], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[172:173], v[236:237], [%[c16], %[c17], %[c18], %[c19]] \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[174:175], v[238:239], [%[c16], %[c17], %[c18], %[c19]] \n"
" ds_read_b128 v[196:199], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_1] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[160:161], v[240:241], [%[c20], %[c21], %[c22], %[c23]] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[162:163], v[242:243], [%[c20], %[c21], %[c22], %[c23]] \n"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], %[s_res_b], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[164:165], v[244:245], [%[c20], %[c21], %[c22], %[c23]] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[166:167], v[246:247], [%[c20], %[c21], %[c22], %[c23]] \n"
" ds_read_b128 v[200:203], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_2] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[168:169], v[248:249], [%[c20], %[c21], %[c22], %[c23]] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[170:171], v[250:251], [%[c20], %[c21], %[c22], %[c23]] \n"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], %[s_res_b], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[172:173], v[252:253], [%[c20], %[c21], %[c22], %[c23]] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[174:175], v[254:255], [%[c20], %[c21], %[c22], %[c23]] \n"
" ds_read_b128 v[204:207], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_3] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[176:177], v[224:225], [%[c24], %[c25], %[c26], %[c27]] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[178:179], v[226:227], [%[c24], %[c25], %[c26], %[c27]] \n"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], %[s_res_b], 0 offen \n"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[24:27], 0 offen \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[180:181], v[228:229], [%[c24], %[c25], %[c26], %[c27]] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[182:183], v[230:231], [%[c24], %[c25], %[c26], %[c27]] \n"
" ds_read_b128 v[208:211], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_4] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[184:185], v[232:233], [%[c24], %[c25], %[c26], %[c27]] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[186:187], v[234:235], [%[c24], %[c25], %[c26], %[c27]] \n"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], %[s_res_b], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[188:189], v[236:237], [%[c24], %[c25], %[c26], %[c27]] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[190:191], v[238:239], [%[c24], %[c25], %[c26], %[c27]] \n"
" ds_read_b128 v[212:215], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_5] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[176:177], v[240:241], [%[c28], %[c29], %[c30], %[c31]] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[178:179], v[242:243], [%[c28], %[c29], %[c30], %[c31]] \n"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], %[s_res_b], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[180:181], v[244:245], [%[c28], %[c29], %[c30], %[c31]] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[182:183], v[246:247], [%[c28], %[c29], %[c30], %[c31]] \n"
" ds_read_b128 v[216:219], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_6] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[184:185], v[248:249], [%[c28], %[c29], %[c30], %[c31]] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[186:187], v[250:251], [%[c28], %[c29], %[c30], %[c31]] \n"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], %[s_res_b], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[188:189], v[252:253], [%[c28], %[c29], %[c30], %[c31]] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[190:191], v[254:255], [%[c28], %[c29], %[c30], %[c31]] \n"
" ds_read_b128 v[220:223], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_7] \n"
" s_waitcnt vmcnt(32) \n"
_UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[192:193], v[224:225], [%[c32], %[c33], %[c34], %[c35]] \n"
_UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[194:195], v[226:227], [%[c32], %[c33], %[c34], %[c35]] \n"
" buffer_load_dwordx4 acc[64:67], %[v_os_b4], %[s_res_b], 0 offen \n"
" buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[24:27], 0 offen \n"
_UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[196:197], v[228:229], [%[c32], %[c33], %[c34], %[c35]] \n"
_UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[198:199], v[230:231], [%[c32], %[c33], %[c34], %[c35]] \n"
_UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[200:201], v[232:233], [%[c32], %[c33], %[c34], %[c35]] \n"
_UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[202:203], v[234:235], [%[c32], %[c33], %[c34], %[c35]] \n"
" buffer_load_dwordx4 acc[68:71], %[v_os_b4], %[s_res_b], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[204:205], v[236:237], [%[c32], %[c33], %[c34], %[c35]] \n"
_UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[206:207], v[238:239], [%[c32], %[c33], %[c34], %[c35]] \n"
_UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[192:193], v[240:241], [%[c36], %[c37], %[c38], %[c39]] \n"
_UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[194:195], v[242:243], [%[c36], %[c37], %[c38], %[c39]] \n"
" buffer_load_dwordx4 acc[72:75], %[v_os_b4], %[s_res_b], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[196:197], v[244:245], [%[c36], %[c37], %[c38], %[c39]] \n"
_UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[198:199], v[246:247], [%[c36], %[c37], %[c38], %[c39]] \n"
_UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[200:201], v[248:249], [%[c36], %[c37], %[c38], %[c39]] \n"
_UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[202:203], v[250:251], [%[c36], %[c37], %[c38], %[c39]] \n"
" buffer_load_dwordx4 acc[76:79], %[v_os_b4], %[s_res_b], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[204:205], v[252:253], [%[c36], %[c37], %[c38], %[c39]] \n"
_UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[206:207], v[254:255], [%[c36], %[c37], %[c38], %[c39]] \n"
_UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[208:209], v[224:225], [%[c40], %[c41], %[c42], %[c43]] \n"
_UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[210:211], v[226:227], [%[c40], %[c41], %[c42], %[c43]] \n"
" buffer_load_dwordx4 acc[80:83], %[v_os_b5], %[s_res_b], 0 offen \n"
" buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[24:27], 0 offen \n"
_UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[212:213], v[228:229], [%[c40], %[c41], %[c42], %[c43]] \n"
_UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[214:215], v[230:231], [%[c40], %[c41], %[c42], %[c43]] \n"
_UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[216:217], v[232:233], [%[c40], %[c41], %[c42], %[c43]] \n"
_UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[218:219], v[234:235], [%[c40], %[c41], %[c42], %[c43]] \n"
" buffer_load_dwordx4 acc[84:87], %[v_os_b5], %[s_res_b], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[220:221], v[236:237], [%[c40], %[c41], %[c42], %[c43]] \n"
_UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[222:223], v[238:239], [%[c40], %[c41], %[c42], %[c43]] \n"
_UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[208:209], v[240:241], [%[c44], %[c45], %[c46], %[c47]] \n"
_UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[210:211], v[242:243], [%[c44], %[c45], %[c46], %[c47]] \n"
" buffer_load_dwordx4 acc[88:91], %[v_os_b5], %[s_res_b], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[212:213], v[244:245], [%[c44], %[c45], %[c46], %[c47]] \n"
_UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[214:215], v[246:247], [%[c44], %[c45], %[c46], %[c47]] \n"
_UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[216:217], v[248:249], [%[c44], %[c45], %[c46], %[c47]] \n"
_UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[218:219], v[250:251], [%[c44], %[c45], %[c46], %[c47]] \n"
" buffer_load_dwordx4 acc[92:95], %[v_os_b5], %[s_res_b], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[220:221], v[252:253], [%[c44], %[c45], %[c46], %[c47]] \n"
_UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[222:223], v[254:255], [%[c44], %[c45], %[c46], %[c47]] \n"
" s_waitcnt vmcnt(32) \n"
_UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[224:225], v[224:225], [%[c48], %[c49], %[c50], %[c51]] \n"
_UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[226:227], v[226:227], [%[c48], %[c49], %[c50], %[c51]] \n"
" buffer_load_dwordx4 acc[96:99], %[v_os_b6], %[s_res_b], 0 offen \n"
" buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[24:27], 0 offen \n"
_UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[228:229], v[228:229], [%[c48], %[c49], %[c50], %[c51]] \n"
_UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[230:231], v[230:231], [%[c48], %[c49], %[c50], %[c51]] \n"
_UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[232:233], v[232:233], [%[c48], %[c49], %[c50], %[c51]] \n"
_UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[234:235], v[234:235], [%[c48], %[c49], %[c50], %[c51]] \n"
" buffer_load_dwordx4 acc[100:103], %[v_os_b6], %[s_res_b], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[236:237], v[236:237], [%[c48], %[c49], %[c50], %[c51]] \n"
_UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[238:239], v[238:239], [%[c48], %[c49], %[c50], %[c51]] \n"
_UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[224:225], v[240:241], [%[c52], %[c53], %[c54], %[c55]] \n"
_UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[226:227], v[242:243], [%[c52], %[c53], %[c54], %[c55]] \n"
" buffer_load_dwordx4 acc[104:107], %[v_os_b6], %[s_res_b], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[228:229], v[244:245], [%[c52], %[c53], %[c54], %[c55]] \n"
_UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[230:231], v[246:247], [%[c52], %[c53], %[c54], %[c55]] \n"
_UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[232:233], v[248:249], [%[c52], %[c53], %[c54], %[c55]] \n"
_UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[234:235], v[250:251], [%[c52], %[c53], %[c54], %[c55]] \n"
" buffer_load_dwordx4 acc[108:111], %[v_os_b6], %[s_res_b], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[236:237], v[252:253], [%[c52], %[c53], %[c54], %[c55]] \n"
_UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[238:239], v[254:255], [%[c52], %[c53], %[c54], %[c55]] \n"
_UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[240:241], v[224:225], [%[c56], %[c57], %[c58], %[c59]] \n"
_UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[242:243], v[226:227], [%[c56], %[c57], %[c58], %[c59]] \n"
" buffer_load_dwordx4 acc[112:115], %[v_os_b7], %[s_res_b], 0 offen \n"
" buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[24:27], 0 offen \n"
_UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[244:245], v[228:229], [%[c56], %[c57], %[c58], %[c59]] \n"
_UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[246:247], v[230:231], [%[c56], %[c57], %[c58], %[c59]] \n"
_UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[248:249], v[232:233], [%[c56], %[c57], %[c58], %[c59]] \n"
_UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[250:251], v[234:235], [%[c56], %[c57], %[c58], %[c59]] \n"
" buffer_load_dwordx4 acc[116:119], %[v_os_b7], %[s_res_b], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[252:253], v[236:237], [%[c56], %[c57], %[c58], %[c59]] \n"
_UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[254:255], v[238:239], [%[c56], %[c57], %[c58], %[c59]] \n"
_UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[240:241], v[240:241], [%[c60], %[c61], %[c62], %[c63]] \n"
_UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[242:243], v[242:243], [%[c60], %[c61], %[c62], %[c63]] \n"
" buffer_load_dwordx4 acc[120:123], %[v_os_b7], %[s_res_b], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[244:245], v[244:245], [%[c60], %[c61], %[c62], %[c63]] \n"
_UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[246:247], v[246:247], [%[c60], %[c61], %[c62], %[c63]] \n"
_UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[248:249], v[248:249], [%[c60], %[c61], %[c62], %[c63]] \n"
_UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[250:251], v[250:251], [%[c60], %[c61], %[c62], %[c63]] \n"
" buffer_load_dwordx4 acc[124:127], %[v_os_b7], %[s_res_b], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[252:253], v[252:253], [%[c60], %[c61], %[c62], %[c63]] \n"
_UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[254:255], v[254:255], [%[c60], %[c61], %[c62], %[c63]] \n"
" s_add_u32 s60, 0x00000300, s80 \n"
......@@ -569,3 +583,4 @@ _DEQUAN_CVT_("%[c60]","%[c61]","%[c62]","%[c63]","%[a_scale1]"," %[gq_scale1]","
#undef _UK_MFMA_
#undef _DEQUAN_CVT_
......@@ -157,7 +157,7 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8
return w;
}
template <typename ROW_COORDS>
template <typename ROW_IDS>
CK_TILE_DEVICE auto GetAScale(const ROW_IDS row_ids_mma,
const AScaleDataType* a_scale_ptr)
{
......@@ -165,9 +165,9 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8
array<TopkWeightDataType, n_size> w;
static_for<0, n_size, 1>{}([&](auto i) {
auto row_id = row_idx_mma[i] & 0xffffff;
auto itp_k = row_idx_mma[i] >> 24;
w.at(i) = sorted_weight_ptr[row_id *kargs.topk+itp_k];
auto row_id = row_ids_mma[i] & 0xffffff;
auto itp_k = row_ids_mma[i] >> 24;
w.at(i) = a_scale_ptr[row_id * 5+itp_k];
});
return w;
......@@ -199,13 +199,14 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8
// auto q_is = threadIdx.x & 0x3;
array<index_t, Repeat_N> coords;
static_for<0, Repeat_N, 1>{}([&](auto i) { coords.at(i) = base_coord + (threadIdx.x / MLanes) * 4 +
static_for<0, Repeat_N, 1>{}([&](auto i) { coords.at(i) = base_offset + (threadIdx.x / MLanes) * 4 +
(threadIdx.x & 0xffff)/4 * 64 +
q_id +
threadIdx.x & 0x3 +
i * 256 ; });
return coords;
}
// this calculation is shared by G and SMQ
template <typename COL_IDS>
CK_TILE_DEVICE auto GetGQScale(const COL_IDS coords,
const GScaleDataType* g_scale_ptr)
{
......@@ -218,6 +219,7 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8
return g_scale_value;
}
template <typename COL_IDS>
CK_TILE_DEVICE auto GetSMQScale(const COL_IDS coords,
const YSmoothScaleDataType * y_scale_ptr)
{
......@@ -251,8 +253,6 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8
index_t expert_stride_0 = shared_intermediate_size_0 * kargs.hidden_size;
index_t expert_stride_1 = shared_intermediate_size_1 * kargs.hidden_size;
/////////////
index_t g_scale_expert_stride_0 = shared_intermediate_size_0;
index_t smq_scale_expert_stride_0 = shared_intermediate_size_0;
index_t d_scale_expert_stride_1 = kargs.hidden_size;
// nr*kr*w
index_t interm_idx_nr0 = __builtin_amdgcn_readfirstlane(
......@@ -283,20 +283,6 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8
auto a_res =
make_wave_buffer_resource(reinterpret_cast<const ADataType*>(kargs.a_ptr),
kargs.num_tokens * kargs.stride_token * sizeof(ADataType));
//////aq
auto aq_win = [&]() {
const AScaleDataType* aq_ptr = reinterpret_cast<const AScaleDataType*>(kargs.a_scale_ptr);
auto aq_view_ = make_naive_tensor_view<address_space_enum::global>(
aq_ptr,
make_tuple(kargs.num_tokens * kargs.topk),
make_tuple(1),
number<1>{},
number<1>{});
return aq_view_;
}();
auto aq_res = aq_win.get_buffer_view().cached_buf_res_;
////////
auto g_win = [&]() {
const GDataType* g_ptr = reinterpret_cast<const GDataType*>(kargs.g_ptr) +
......@@ -323,40 +309,6 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8
auto g_res = g_win.get_bottom_tensor_view().get_buffer_view().cached_buf_res_;
auto g_coords = generate_tuple([&](auto i) { return g_win.cached_coords_[i].get_offset(); },
number<decltype(g_win)::NumAccess_NonLinear>{});
//////gq
auto gq_win = [&]() {
const GScaleDataType* gq_ptr = reinterpret_cast<const GScaleDataType*>(kargs.g_scale_ptr) +
static_cast<long_index_t>(expert_id) * g_scale_expert_stride_0 +
intermediate_tile_id * BlockShape::Block_N0;
// const GDataType* g_ptr = reinterpret_cast<const GScaleDataType*>(kargs.g_scale_ptr);//remember to add expert id for inline
auto gq_view_ = make_naive_tensor_view<address_space_enum::global>(
gq_ptr,
make_tuple(shared_intermediate_size_1),
make_tuple(1),
number<1>{},
number<1>{});
return gq_view_;
}();
auto gq_res = gq_win.get_buffer_view().cached_buf_res_;
////smQ
auto smq_win = [&]() {
const YSmoothScaleDataType* smq_ptr = reinterpret_cast<const YSmoothScaleDataType*>(kargs.y_smooth_scale_ptr) +
static_cast<long_index_t>(expert_id) * smq_scale_expert_stride_0 +
intermediate_tile_id * BlockShape::Block_K1;
// const GDataType* g_ptr = reinterpret_cast<const GScaleDataType*>(kargs.g_scale_ptr);//remember to add expert id for inline
auto smq_view_ = make_naive_tensor_view<address_space_enum::global>(
smq_ptr,
make_tuple(shared_intermediate_size_1),
make_tuple(1),
number<1>{},
number<1>{});
return smq_view_;
}();
auto smq_res = smq_win.get_buffer_view().cached_buf_res_;
/////////////////////
const auto d_win = [&]() {
const DDataType* d_ptr = reinterpret_cast<const DDataType*>(kargs.d_ptr) +
......@@ -395,7 +347,7 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8
return dq_view_;
}();
auto dq_res = dq_win.get_buffer_view().cached_buf_res_;
auto dq_res = dq_win.get_buffer_view().cached_buf_res_;
////
// TODO: load D order is N0.K0...127, N64.K0...127, N0.K128...255, N64.K128...255
......@@ -447,16 +399,18 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8
row_coords_o, reinterpret_cast<const TopkWeightDataType*>(kargs.sorted_weight_ptr));
auto a_scale = GetAScale(
row_ids_a_mma, reinterpret_cast<const AScaleDataType*>(kargs.a_scale_ptr));
auto gqsmq_coords = GetColCoords_GQSMQ(intermediated_tile_id * BlockShape::Block_K1);
auto gqsmq_coords = GetColCoords_GQSMQ(intermediate_tile_id * BlockShape::Block_K1);
auto dq_coords = gqsmq_coords[0];//only one for this tiling
auto gq_scale = GetGQScale(
gqsmq_coords, reinterpret_cast<const GScaleDataType*>(kargs.g_scale_ptr + static_cast<long_index_t>(expert_id) * shared_intermediate_size_0));
gqsmq_coords, (reinterpret_cast<const GScaleDataType*>(kargs.g_scale_ptr) + static_cast<long_index_t>(expert_id) * shared_intermediate_size_0));
auto smq_scale = GetSMQScale(
gqsmq_coords, reinterpret_cast<const YSmoothScaleDataType*>(kargs.y_smooth_scale_ptr + static_cast<long_index_t>(expert_id) * shared_intermediate_size_0));
gqsmq_coords, (reinterpret_cast<const YSmoothScaleDataType*>(kargs.y_smooth_scale_ptr) + static_cast<long_index_t>(expert_id) * shared_intermediate_size_0));
auto uk_0 = Policy::template GetUK_0<Problem>();
// auto acc_0= uk_0(
uk_0( a_scale,
gq_scale,
uk_0( a_scale,
gq_scale,
d_res,
dq_res,
a_res,
a_coords,
g_res,
......@@ -485,8 +439,9 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8
// block_sync_lds();
auto uk_1 = Policy::template GetUK_1<Problem>();
uk_1(dq_res,
d_res,
uk_1(
// dq_res,
// d_res,
dq_coords,
d_coords,
o_res,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment