"example/12_reduce/README.md" did not exist on "e823d518cb46ad61ddb3c70eac8529e0a58af1f8"
Commit 9a46c0e7 authored by shengnxu's avatar shengnxu
Browse files

move a scale out inline

parent 26d84960
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
#include "fused_moegemm_api_traits.hpp" #include "fused_moegemm_api_traits.hpp"
#include "ck_tile/ops/fused_moe.hpp" #include "ck_tile/ops/fused_moe.hpp"
#include "fused_moegemm_api.cpp" // #include "fused_moegemm_api.cpp"
#include <iostream> #include <iostream>
template <ck_tile::index_t... Is> template <ck_tile::index_t... Is>
......
...@@ -264,6 +264,7 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16 ...@@ -264,6 +264,7 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16
static_assert(ACoords::size() == Block_M * Block_K / BlockSize / 4 /*2x per dword*/); // 8 static_assert(ACoords::size() == Block_M * Block_K / BlockSize / 4 /*2x per dword*/); // 8
static_assert(BCoords::size() == Repeat_N); static_assert(BCoords::size() == Repeat_N);
static_assert(AToken_id::size() == Repeat_M); static_assert(AToken_id::size() == Repeat_M);
static_assert(Ascale::size() == Repeat_M);
auto a_sst = make_tile_window( auto a_sst = make_tile_window(
make_tensor_view<address_space_enum::lds>( make_tensor_view<address_space_enum::lds>(
...@@ -451,30 +452,18 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16 ...@@ -451,30 +452,18 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16
[v_token_id0]"+v"(temp0), [v_token_id0]"+v"(temp0),
[v_token_id1]"+v"(temp1), [v_token_id1]"+v"(temp1),
[s_mem_]"+r"(smem) [s_mem_]"+r"(smem)
: [s_res_aq0]"s"(res_aq[0]), : [s_res_aq]"s"(res_aq),
[s_res_aq1]"s"(res_aq[1]), [s_res_dq]"s"(res_dq),
[s_res_aq2]"s"(res_aq[2]), [s_res_gq]"s"(res_gq),
[s_res_aq3]"s"(res_aq[3]), [s_res_smq]"s"(res_smq),
[s_res_dq0]"s"(res_dq[0]), [s_res_a]"s"(res_a),
[s_res_dq1]"s"(res_dq[1]), // [s_res_a1]"s"(res_a[1]),
[s_res_dq2]"s"(res_dq[2]), // [s_res_a2]"s"(res_a[2]),
[s_res_dq3]"s"(res_dq[3]), // [s_res_a3]"s"(res_a[3]),
[s_res_gq0]"s"(res_gq[0]), [s_res_b]"s"(res_b),
[s_res_gq1]"s"(res_gq[1]), // [s_res_b1]"s"(res_b[1]),
[s_res_gq2]"s"(res_gq[2]), // [s_res_b2]"s"(res_b[2]),
[s_res_gq3]"s"(res_gq[3]), // [s_res_b3]"s"(res_b[3]),
[s_res_smq0]"s"(res_smq[0]),
[s_res_smq1]"s"(res_smq[1]),
[s_res_smq2]"s"(res_smq[2]),
[s_res_smq3]"s"(res_smq[3]),
[s_res_a0]"s"(res_a[0]),
[s_res_a1]"s"(res_a[1]),
[s_res_a2]"s"(res_a[2]),
[s_res_a3]"s"(res_a[3]),
[s_res_b0]"s"(res_b[0]),
[s_res_b1]"s"(res_b[1]),
[s_res_b2]"s"(res_b[2]),
[s_res_b3]"s"(res_b[3]),
[v_os_a0]"v"(static_cast<index_t>(cached_coords_a[number<0>{}] * sizeof(ADataType))), [v_os_a0]"v"(static_cast<index_t>(cached_coords_a[number<0>{}] * sizeof(ADataType))),
[v_os_a1]"v"(static_cast<index_t>(cached_coords_a[number<1>{}] * sizeof(ADataType))), [v_os_a1]"v"(static_cast<index_t>(cached_coords_a[number<1>{}] * sizeof(ADataType))),
[v_os_a2]"v"(static_cast<index_t>(cached_coords_a[number<2>{}] * sizeof(ADataType))), [v_os_a2]"v"(static_cast<index_t>(cached_coords_a[number<2>{}] * sizeof(ADataType))),
...@@ -539,21 +528,15 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16 ...@@ -539,21 +528,15 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16
"a236", "a237", "a238", "a239", "a240", "a241", "a242", "a243", "a236", "a237", "a238", "a239", "a240", "a241", "a242", "a243",
"a244", "a245", "a246", "a247", "a248", "a249", "a250", "a251", "a244", "a245", "a246", "a247", "a248", "a249", "a250", "a251",
"a252", "a253", "a254", "a255", "a252", "a253", "a254", "a255",
"s6", "s7", "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15", "s6", "s7", "s40", "s41", "s42", "s43", "s44", "s45",
"s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23", "s24", "s25",
"s26", "s27", "s28", "s29", "s30", "s31", "s32", "s33", "s34", "s35",
"s36", "s37", "s38", "s39", "s40", "s41", "s42", "s43", "s44", "s45",
"s46", "s47", "s48", "s49", "s50", "s51", "s52", "s53", "s54", "s46", "s47", "s48", "s49", "s50", "s51", "s52", "s53", "s54",
"s55", "s56", "s57", "s58", "s59", "s60", "s61", "s62", "s63", "s55", "s56", "s57", "s58", "s59", "s60", "s61", "s62", "s63",
"s64", "s65", "s66", "s67", "s68", "s69", "s70", "s71", "s72", "s64", "s65", "s66", "s67", "s68", "s69", "s70", "s71", "s72",
"s73", "s74", "s75", "s76", "s77", "s78", "s79", "s80", // s86 as tmp "s73", "s74", "s75", "s76", "s77", "s78", "s79", "s80", // s86 as tmp
"v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
"v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
"v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v20", "v21", "v22", "v23", "v24", "v25", "v50", "v51", "v52", "v53", "v54", "v55",
"v29", "v30", "v31", "v32", "v33", "v34", "v35", "v36", "v37", "v56", "v57", "v64",
"v38", "v39", "v40", "v41", "v42", "v43", "v44", "v45", "v46",
"v47", "v48", "v49", "v50", "v51", "v52", "v53", "v54", "v55",
"v56", "v57", "v58", "v59", "v60", "v61", "v62", "v63", "v64",
"v65", "v66", "v67", "v68", "v69", "v70", "v71", "v72", "v73", "v65", "v66", "v67", "v68", "v69", "v70", "v71", "v72", "v73",
"v74", "v75", "v76", "v77", "v78", "v79", "v80", "v81", "v82", "v74", "v75", "v76", "v77", "v78", "v79", "v80", "v81", "v82",
"v83", "v84", "v85", "v86", "v87", "v88", "v89", "v90", "v91", "v83", "v84", "v85", "v86", "v87", "v88", "v89", "v90", "v91",
......
...@@ -78,21 +78,23 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x ...@@ -78,21 +78,23 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x
// TODO: need paired with tile_window_linear! // TODO: need paired with tile_window_linear!
// TODO: need call init_raw() before call this function! // TODO: need call init_raw() before call this function!
// template <typename AWindow, typename BWindow, typename OWindow, typename ScaleTensor> // template <typename AWindow, typename BWindow, typename OWindow, typename ScaleTensor>
template <typename BRes, template <typename DQRes,
typename BRes,
typename BCoords, typename BCoords,
typename ORes, typename ORes,
typename OCoords, typename OCoords,
typename OFlags, typename OFlags>
typename ScaleTensor> // typename ScaleTensor>
CK_TILE_DEVICE auto CK_TILE_DEVICE auto
operator()(const BRes& res_b, operator()(const DQRes& res_dq,
const BRes& res_b,
const BCoords& cached_coords_b, const BCoords& cached_coords_b,
const ORes& res_o, const ORes& res_o,
const OCoords& cached_coords_o, const OCoords& cached_coords_o,
const OFlags& o_flags, // this should be in sgpr const OFlags& o_flags, // this should be in sgpr
CK_TILE_LDS_ADDR void* smem, CK_TILE_LDS_ADDR void* smem,
index_t n, // loop along n dim index_t n, // loop along n dim
const ScaleTensor& scale_, // const ScaleTensor& scale_,
index_t tile_offset_dq, index_t tile_offset_dq,
index_t tile_offset_b, // stride b is fixed to blockKr * blockW, but still can adjust index_t tile_offset_b, // stride b is fixed to blockKr * blockW, but still can adjust
index_t tile_offset_half_b, //splited load alone K in to 2 part index_t tile_offset_half_b, //splited load alone K in to 2 part
...@@ -106,11 +108,11 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x ...@@ -106,11 +108,11 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x
const index_t tile_stride_o_bytes = tile_offset_o * sizeof(ODataType); const index_t tile_stride_o_bytes = tile_offset_o * sizeof(ODataType);
const index_t tile_stride_dq_bytes = tile_offset_dq * sizeof(DScaleDataType); const index_t tile_stride_dq_bytes = tile_offset_dq * sizeof(DScaleDataType);
static_assert(ScaleTensor::size() == 2); // static_assert(ScaleTensor::size() == 2);
float s0 = scale_[number<0>{}]; // float s0 = scale_[number<0>{}];
float s1 = scale_[number<1>{}]; // float s1 = scale_[number<1>{}];
index_t loop_cnt = n / Block_N; index_t loop_cnt = n ;
// register float v_c0 asm("v64"); // register float v_c0 asm("v64");
// register float v_c1 asm("v65"); // register float v_c1 asm("v65");
...@@ -144,15 +146,15 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x ...@@ -144,15 +146,15 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x
// register float v_c29 asm("v93"); // register float v_c29 asm("v93");
// register float v_c30 asm("v94"); // register float v_c30 asm("v94");
// register float v_c31 asm("v95"); // register float v_c31 asm("v95");
int32_t nan_hi = 0x7fff0000; // int32_t nan_hi = 0x7fff0000;
int32_t nan_lo = 0x00007fff; // int32_t nan_lo = 0x00007fff;
// in smem, the layout is M0(2)*K0(128)*M1(16)*K1(4) // in smem, the layout is M0(2)*K0(128)*M1(16)*K1(4)
// every threads need 8xK in contiguous register // every threads need 8xK in contiguous register
// ... and every wave need the same data // ... and every wave need the same data
int lane_id = threadIdx.x % 64; // int lane_id = threadIdx.x % 64;
int sld_y_os = (lane_id % 16) * 4 + (lane_id / 16) * 128; // int sld_y_os = (lane_id % 16) * 4 + (lane_id / 16) * 128;
sld_y_os *= 2; // sld_y_os *= 2;
// y y p p p y // y y p p p y
// reg before shfl M0(2)*N0(2)*Nl(4)*Nw(4)*Mw(16)*Nv(4) // reg before shfl M0(2)*N0(2)*Nl(4)*Nw(4)*Mw(16)*Nv(4)
...@@ -161,15 +163,15 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x ...@@ -161,15 +163,15 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x
// M0(2)* N0(2) * Nl(4) * Nw(4) * (Mw(16)*Nv(4) + 4) // M0(2)* N0(2) * Nl(4) * Nw(4) * (Mw(16)*Nv(4) + 4)
// y y wave-id lid/16 lid%16 v // y y wave-id lid/16 lid%16 v
// sst(v3) = (v0/16*34 + v0%16 * 2 + wid*136) * 4 // sst(v3) = (v0/16*34 + v0%16 * 2 + wid*136) * 4
int sfl_sst = (threadIdx.x % 16 * 4) + (threadIdx.x / 16) * (64 + 4); // int sfl_sst = (threadIdx.x % 16 * 4) + (threadIdx.x / 16) * (64 + 4);
sfl_sst *= 2; // sfl_sst *= 2;
// from LDS we need load as // from LDS we need load as
// M0(2)* N0(2) * Nl(4) * Nw(4) * (Mw(16) * Nv(4) + 4) // M0(2)* N0(2) * Nl(4) * Nw(4) * (Mw(16) * Nv(4) + 4)
// ( 2 issue) (rem 32-lane) (4 wave*4issue) 2lane*1ussue(pk2) // ( 2 issue) (rem 32-lane) (4 wave*4issue) 2lane*1ussue(pk2)
// sld(v4) = v0/2 *34*4 + v0 % 2 *4 + wid*2 *4 // sld(v4) = v0/2 *34*4 + v0 % 2 *4 + wid*2 *4
int sfl_sld = (lane_id % 2) * 2 + (lane_id / 2) * (64 + 4) + (threadIdx.x / 64) * 4; // int sfl_sld = (lane_id % 2) * 2 + (lane_id / 2) * (64 + 4) + (threadIdx.x / 64) * 4;
sfl_sld *= 2; // sfl_sld *= 2;
// B nr->kr // B nr->kr
// clang-format off // clang-format off
...@@ -214,18 +216,19 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x ...@@ -214,18 +216,19 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x
// [c30]"+v"(v_c30), // [c30]"+v"(v_c30),
// [c31]"+v"(v_c31) // [c31]"+v"(v_c31)
:[sld_a_base]"n"(0), :[sld_a_base]"n"(0),
[shfl_base]"n"(0), // [shfl_base]"n"(0),
[v_sld_y_os]"v"(sld_y_os), // [v_sld_y_os]"v"(sld_y_os),
[v_sfl_sld]"v"(sfl_sld), // [v_sfl_sld]"v"(sfl_sld),
[v_sfl_sst]"v"(sfl_sst), // [v_sfl_sst]"v"(sfl_sst),
[s_res_dq]"s"(res_dq),
[s_res_o0]"s"(res_o[0]), [s_res_o0]"s"(res_o[0]),
[s_res_o1]"s"(res_o[1]), [s_res_o1]"s"(res_o[1]),
//[s_res_o2]"s"(res_o[2]), //[s_res_o2]"s"(res_o[2]),
//[s_res_o3]"s"(res_o[3]), //[s_res_o3]"s"(res_o[3]),
[s_res_b0]"s"(res_b[0]), [s_res_d]"s"(res_b),
[s_res_b1]"s"(res_b[1]), // [s_res_b1]"s"(res_b[1]),
[s_res_b2]"s"(res_b[2]), // [s_res_b2]"s"(res_b[2]),
[s_res_b3]"s"(res_b[3]), // [s_res_b3]"s"(res_b[3]),
[v_os_o0]"v"(static_cast<index_t>(cached_coords_o[number<0>{}] * sizeof(ODataType))), [v_os_o0]"v"(static_cast<index_t>(cached_coords_o[number<0>{}] * sizeof(ODataType))),
[v_os_o1]"v"(static_cast<index_t>(cached_coords_o[number<1>{}] * sizeof(ODataType))), [v_os_o1]"v"(static_cast<index_t>(cached_coords_o[number<1>{}] * sizeof(ODataType))),
[v_os_o2]"v"(static_cast<index_t>(cached_coords_o[number<2>{}] * sizeof(ODataType))), [v_os_o2]"v"(static_cast<index_t>(cached_coords_o[number<2>{}] * sizeof(ODataType))),
...@@ -242,10 +245,10 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x ...@@ -242,10 +245,10 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x
[s_tile_os_b_half]"s"(tile_offset_half_b_bytes), [s_tile_os_b_half]"s"(tile_offset_half_b_bytes),
[s_tile_os_b]"s"(tile_stride_b_bytes), [s_tile_os_b]"s"(tile_stride_b_bytes),
[s_tile_os_dq]"s"(tile_stride_dq_bytes), [s_tile_os_dq]"s"(tile_stride_dq_bytes),
[scale_0]"v"(s0), // [scale_0]"v"(s0),
[scale_1]"v"(s1), // [scale_1]"v"(s1),
[v_nan_lo]"v"(nan_lo), // [v_nan_lo]"v"(nan_lo),
[v_nan_hi]"v"(nan_hi), // [v_nan_hi]"v"(nan_hi),
[s_execflag_0]"s"(o_flags[number<0>{}]), [s_execflag_0]"s"(o_flags[number<0>{}]),
[s_execflag_1]"s"(o_flags[number<1>{}]), [s_execflag_1]"s"(o_flags[number<1>{}]),
[s_execflag_2]"s"(o_flags[number<2>{}]), [s_execflag_2]"s"(o_flags[number<2>{}]),
...@@ -285,21 +288,15 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x ...@@ -285,21 +288,15 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x
"a236", "a237", "a238", "a239", "a240", "a241", "a242", "a243", "a236", "a237", "a238", "a239", "a240", "a241", "a242", "a243",
"a244", "a245", "a246", "a247", "a248", "a249", "a250", "a251", "a244", "a245", "a246", "a247", "a248", "a249", "a250", "a251",
"a252", "a253", "a254", "a255", "a252", "a253", "a254", "a255",
"s6", "s7", "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15", "s6", "s7", "s40", "s41", "s42", "s43", "s44", "s45",
"s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23", "s24", "s25",
"s26", "s27", "s28", "s29", "s30", "s31", "s32", "s33", "s34", "s35",
"s36", "s37", "s38", "s39", "s40", "s41", "s42", "s43", "s44", "s45",
"s46", "s47", "s48", "s49", "s50", "s51", "s52", "s53", "s54", "s46", "s47", "s48", "s49", "s50", "s51", "s52", "s53", "s54",
"s55", "s56", "s57", "s58", "s59", "s60", "s61", "s62", "s63", "s55", "s56", "s57", "s58", "s59", "s60", "s61", "s62", "s63",
"s64", "s65", "s66", "s67", "s68", "s69", "s70", "s71", "s72", "s64", "s65", "s66", "s67", "s68", "s69", "s70", "s71", "s72",
"s73", "s74", "s75", "s76", "s77", "s78", "s79", "s80", // s86 as tmp "s73", "s74", "s75", "s76", "s77", "s78", "s79", "s80", // s86 as tmp
"v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
"v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
"v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v20", "v21", "v22", "v23", "v24", "v25", "v50", "v51", "v52", "v53", "v54", "v55",
"v29", "v30", "v31", "v32", "v33", "v34", "v35", "v36", "v37", "v56", "v57", "v64",
"v38", "v39", "v40", "v41", "v42", "v43", "v44", "v45", "v46",
"v47", "v48", "v49", "v50", "v51", "v52", "v53", "v54", "v55",
"v56", "v57", "v58", "v59", "v60", "v61", "v62", "v63", "v64",
"v65", "v66", "v67", "v68", "v69", "v70", "v71", "v72", "v73", "v65", "v66", "v67", "v68", "v69", "v70", "v71", "v72", "v73",
"v74", "v75", "v76", "v77", "v78", "v79", "v80", "v81", "v82", "v74", "v75", "v76", "v77", "v78", "v79", "v80", "v81", "v82",
"v83", "v84", "v85", "v86", "v87", "v88", "v89", "v90", "v91", "v83", "v84", "v85", "v86", "v87", "v88", "v89", "v90", "v91",
...@@ -364,18 +361,19 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x ...@@ -364,18 +361,19 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x
// [c30]"+v"(v_c30), // [c30]"+v"(v_c30),
// [c31]"+v"(v_c31) // [c31]"+v"(v_c31)
:[sld_a_base]"n"(0), :[sld_a_base]"n"(0),
[shfl_base]"n"(0), // [shfl_base]"n"(0),
[v_sld_y_os]"v"(sld_y_os), // [v_sld_y_os]"v"(sld_y_os),
[v_sfl_sld]"v"(sfl_sld), // [v_sfl_sld]"v"(sfl_sld),
[v_sfl_sst]"v"(sfl_sst), // [v_sfl_sst]"v"(sfl_sst),
[s_res_dq]"s"(res_dq),
[s_res_o0]"s"(res_o[0]), [s_res_o0]"s"(res_o[0]),
[s_res_o1]"s"(res_o[1]), [s_res_o1]"s"(res_o[1]),
//[s_res_o2]"s"(res_o[2]), //[s_res_o2]"s"(res_o[2]),
//[s_res_o3]"s"(res_o[3]), //[s_res_o3]"s"(res_o[3]),
[s_res_b0]"s"(res_b[0]), [s_res_d]"s"(res_b),
[s_res_b1]"s"(res_b[1]), // [s_res_b1]"s"(res_b[1]),
[s_res_b2]"s"(res_b[2]), // [s_res_b2]"s"(res_b[2]),
[s_res_b3]"s"(res_b[3]), // [s_res_b3]"s"(res_b[3]),
[v_os_o0]"v"(static_cast<index_t>(cached_coords_o[number<0>{}] * sizeof(ODataType))), [v_os_o0]"v"(static_cast<index_t>(cached_coords_o[number<0>{}] * sizeof(ODataType))),
[v_os_o1]"v"(static_cast<index_t>(cached_coords_o[number<1>{}] * sizeof(ODataType))), [v_os_o1]"v"(static_cast<index_t>(cached_coords_o[number<1>{}] * sizeof(ODataType))),
[v_os_o2]"v"(static_cast<index_t>(cached_coords_o[number<2>{}] * sizeof(ODataType))), [v_os_o2]"v"(static_cast<index_t>(cached_coords_o[number<2>{}] * sizeof(ODataType))),
...@@ -392,10 +390,10 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x ...@@ -392,10 +390,10 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x
[s_tile_os_b_half]"s"(tile_offset_half_b_bytes), [s_tile_os_b_half]"s"(tile_offset_half_b_bytes),
[s_tile_os_b]"s"(tile_stride_b_bytes), [s_tile_os_b]"s"(tile_stride_b_bytes),
[s_tile_os_dq]"s"(tile_stride_dq_bytes), [s_tile_os_dq]"s"(tile_stride_dq_bytes),
[scale_0]"v"(s0), // [scale_0]"v"(s0),
[scale_1]"v"(s1), // [scale_1]"v"(s1),
[v_nan_lo]"v"(nan_lo), // [v_nan_lo]"v"(nan_lo),
[v_nan_hi]"v"(nan_hi), // [v_nan_hi]"v"(nan_hi),
[s_execflag_0]"s"(o_flags[number<0>{}]), [s_execflag_0]"s"(o_flags[number<0>{}]),
[s_execflag_1]"s"(o_flags[number<1>{}]), [s_execflag_1]"s"(o_flags[number<1>{}]),
[s_execflag_2]"s"(o_flags[number<2>{}]), [s_execflag_2]"s"(o_flags[number<2>{}]),
...@@ -435,21 +433,15 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x ...@@ -435,21 +433,15 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x
"a236", "a237", "a238", "a239", "a240", "a241", "a242", "a243", "a236", "a237", "a238", "a239", "a240", "a241", "a242", "a243",
"a244", "a245", "a246", "a247", "a248", "a249", "a250", "a251", "a244", "a245", "a246", "a247", "a248", "a249", "a250", "a251",
"a252", "a253", "a254", "a255", "a252", "a253", "a254", "a255",
"s6", "s7", "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15", "s6", "s7", "s40", "s41", "s42", "s43", "s44", "s45",
"s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23", "s24", "s25",
"s26", "s27", "s28", "s29", "s30", "s31", "s32", "s33", "s34", "s35",
"s36", "s37", "s38", "s39", "s40", "s41", "s42", "s43", "s44", "s45",
"s46", "s47", "s48", "s49", "s50", "s51", "s52", "s53", "s54", "s46", "s47", "s48", "s49", "s50", "s51", "s52", "s53", "s54",
"s55", "s56", "s57", "s58", "s59", "s60", "s61", "s62", "s63", "s55", "s56", "s57", "s58", "s59", "s60", "s61", "s62", "s63",
"s64", "s65", "s66", "s67", "s68", "s69", "s70", "s71", "s72", "s64", "s65", "s66", "s67", "s68", "s69", "s70", "s71", "s72",
"s73", "s74", "s75", "s76", "s77", "s78", "s79", "s80", // s86 as tmp "s73", "s74", "s75", "s76", "s77", "s78", "s79", "s80", // s86 as tmp
"v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
"v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
"v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v20", "v21", "v22", "v23", "v24", "v25", "v50", "v51", "v52", "v53", "v54", "v55",
"v29", "v30", "v31", "v32", "v33", "v34", "v35", "v36", "v37", "v56", "v57", "v64",
"v38", "v39", "v40", "v41", "v42", "v43", "v44", "v45", "v46",
"v47", "v48", "v49", "v50", "v51", "v52", "v53", "v54", "v55",
"v56", "v57", "v58", "v59", "v60", "v61", "v62", "v63", "v64",
"v65", "v66", "v67", "v68", "v69", "v70", "v71", "v72", "v73", "v65", "v66", "v67", "v68", "v69", "v70", "v71", "v72", "v73",
"v74", "v75", "v76", "v77", "v78", "v79", "v80", "v81", "v82", "v74", "v75", "v76", "v77", "v78", "v79", "v80", "v81", "v82",
"v83", "v84", "v85", "v86", "v87", "v88", "v89", "v90", "v91", "v83", "v84", "v85", "v86", "v87", "v88", "v89", "v90", "v91",
......
...@@ -160,7 +160,7 @@ ...@@ -160,7 +160,7 @@
" ds_write_b64 %[v_sfl_sst], [%[c6],%[c7]] offset:23168 \n" " ds_write_b64 %[v_sfl_sst], [%[c6],%[c7]] offset:23168 \n"
" s_mov_b32 s80, 0 \n" " s_mov_b32 s80, 0 \n"
" s_waitcnt vmcnt(24) \n" " s_waitcnt vmcnt(24) \n"
"label_0AA6: \n" "L_start%=: \n"
" s_waitcnt vmcnt(30) & lgkmcnt(0) \n" " s_waitcnt vmcnt(30) & lgkmcnt(0) \n"
" s_barrier \n" " s_barrier \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[0:1], v[128:129], 0 \n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[0:1], v[128:129], 0 \n"
...@@ -398,7 +398,7 @@ _UK_PK_CVT_("%[c12]", "%[c13]", "%[c6]") ...@@ -398,7 +398,7 @@ _UK_PK_CVT_("%[c12]", "%[c13]", "%[c6]")
_UK_PK_CVT_("%[c14]", "%[c15]", "%[c7]") _UK_PK_CVT_("%[c14]", "%[c15]", "%[c7]")
" s_addk_i32 s80, 0x0080 \n" " s_addk_i32 s80, 0x0080 \n"
" s_cmp_lt_i32 s80, %[s_loop_cnt] \n" " s_cmp_lt_i32 s80, %[s_loop_cnt] \n"
" s_cbranch_scc0 label_0EC1 \n" " s_cbranch_scc0 L_end%= \n"
" s_waitcnt vmcnt(30) & lgkmcnt(0) \n" " s_waitcnt vmcnt(30) & lgkmcnt(0) \n"
" s_barrier \n" " s_barrier \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[128:129], v[128:129], 0 \n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[128:129], v[128:129], 0 \n"
...@@ -636,9 +636,9 @@ _UK_PK_CVT_("%[c28]", "%[c29]", "%[c22]") ...@@ -636,9 +636,9 @@ _UK_PK_CVT_("%[c28]", "%[c29]", "%[c22]")
_UK_PK_CVT_("%[c30]", "%[c31]", "%[c23]") _UK_PK_CVT_("%[c30]", "%[c31]", "%[c23]")
" s_addk_i32 s80, 0x0080 \n" " s_addk_i32 s80, 0x0080 \n"
" s_cmp_lt_i32 s80, %[s_loop_cnt] \n" " s_cmp_lt_i32 s80, %[s_loop_cnt] \n"
" s_cbranch_scc0 label_0EC1 \n" " s_cbranch_scc0 L_end%= \n"
" s_branch label_0AA6 \n" " s_branch L_start%= \n"
" label_0EC1: \n" " L_end%=: \n"
" s_waitcnt lgkmcnt(0) \n" " s_waitcnt lgkmcnt(0) \n"
" s_barrier \n" " s_barrier \n"
" ds_read_b32 v10, %[v_sfl_sld] offset:16640 \n" " ds_read_b32 v10, %[v_sfl_sld] offset:16640 \n"
......
...@@ -27,14 +27,8 @@ ...@@ -27,14 +27,8 @@
# define _UK_ATOMIC_ADD_ "global_atomic_pk_add_f16" # define _UK_ATOMIC_ADD_ "global_atomic_pk_add_f16"
#endif #endif
" s_mov_b32 s8, %[s_res_o0] \n"
" s_mov_b32 s9, %[s_res_o1] \n"
" s_mov_b32 s12, %[s_res_b0] \n"
" s_mov_b32 s13, %[s_res_b1] \n"
" s_mov_b32 s14, %[s_res_b2] \n"
" s_mov_b32 s15, %[s_res_b3] \n"
" s_waitcnt vmcnt(24) \n" " s_waitcnt vmcnt(24) \n"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen\n" " buffer_load_dwordx4 acc[0:3], %[v_os_b0], %[s_res_d], 0 offen\n"
" v_mul_f32 v54, v128, v128 \n" " v_mul_f32 v54, v128, v128 \n"
" v_mul_f32 v55, v129, v129 \n" " v_mul_f32 v55, v129, v129 \n"
" v_mul_f32 v56, v130, v130 \n" " v_mul_f32 v56, v130, v130 \n"
...@@ -55,7 +49,7 @@ ...@@ -55,7 +49,7 @@
" v_exp_f32 v55, v55 \n" " v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n" " v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n" " v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024\n" " buffer_load_dwordx4 acc[4:7], %[v_os_b0], %[s_res_d], 0 offen offset:1024\n"
" v_add_f32 v54, v54, 1.0 \n" " v_add_f32 v54, v54, 1.0 \n"
" v_add_f32 v55, v55, 1.0 \n" " v_add_f32 v55, v55, 1.0 \n"
" v_add_f32 v56, v56, 1.0 \n" " v_add_f32 v56, v56, 1.0 \n"
...@@ -68,7 +62,7 @@ ...@@ -68,7 +62,7 @@
" v_mul_f32 v129, v129, v55 \n" " v_mul_f32 v129, v129, v55 \n"
" v_mul_f32 v130, v130, v56 \n" " v_mul_f32 v130, v130, v56 \n"
" v_mul_f32 v131, v131, v57 \n" " v_mul_f32 v131, v131, v57 \n"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048\n" " buffer_load_dwordx4 acc[8:11], %[v_os_b0], %[s_res_d], 0 offen offset:2048\n"
" v_mul_f32 v54, v132, v132 \n" " v_mul_f32 v54, v132, v132 \n"
" v_mul_f32 v55, v133, v133 \n" " v_mul_f32 v55, v133, v133 \n"
" v_mul_f32 v56, v134, v134 \n" " v_mul_f32 v56, v134, v134 \n"
...@@ -89,7 +83,7 @@ ...@@ -89,7 +83,7 @@
" v_exp_f32 v55, v55 \n" " v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n" " v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n" " v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072\n" " buffer_load_dwordx4 acc[12:15], %[v_os_b0], %[s_res_d], 0 offen offset:3072\n"
" v_add_f32 v54, v54, 1.0 \n" " v_add_f32 v54, v54, 1.0 \n"
" v_add_f32 v55, v55, 1.0 \n" " v_add_f32 v55, v55, 1.0 \n"
" v_add_f32 v56, v56, 1.0 \n" " v_add_f32 v56, v56, 1.0 \n"
...@@ -102,7 +96,7 @@ ...@@ -102,7 +96,7 @@
" v_mul_f32 v133, v133, v55 \n" " v_mul_f32 v133, v133, v55 \n"
" v_mul_f32 v134, v134, v56 \n" " v_mul_f32 v134, v134, v56 \n"
" v_mul_f32 v135, v135, v57 \n" " v_mul_f32 v135, v135, v57 \n"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen\n" " buffer_load_dwordx4 acc[16:19], %[v_os_b1], %[s_res_d], 0 offen\n"
" v_mul_f32 v54, v136, v136 \n" " v_mul_f32 v54, v136, v136 \n"
" v_mul_f32 v55, v137, v137 \n" " v_mul_f32 v55, v137, v137 \n"
" v_mul_f32 v56, v138, v138 \n" " v_mul_f32 v56, v138, v138 \n"
...@@ -123,7 +117,7 @@ ...@@ -123,7 +117,7 @@
" v_exp_f32 v55, v55 \n" " v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n" " v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n" " v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024\n" " buffer_load_dwordx4 acc[20:23], %[v_os_b1], %[s_res_d], 0 offen offset:1024\n"
" v_add_f32 v54, v54, 1.0 \n" " v_add_f32 v54, v54, 1.0 \n"
" v_add_f32 v55, v55, 1.0 \n" " v_add_f32 v55, v55, 1.0 \n"
" v_add_f32 v56, v56, 1.0 \n" " v_add_f32 v56, v56, 1.0 \n"
...@@ -136,7 +130,7 @@ ...@@ -136,7 +130,7 @@
" v_mul_f32 v137, v137, v55 \n" " v_mul_f32 v137, v137, v55 \n"
" v_mul_f32 v138, v138, v56 \n" " v_mul_f32 v138, v138, v56 \n"
" v_mul_f32 v139, v139, v57 \n" " v_mul_f32 v139, v139, v57 \n"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048\n" " buffer_load_dwordx4 acc[24:27], %[v_os_b1], %[s_res_d], 0 offen offset:2048\n"
" v_mul_f32 v54, v140, v140 \n" " v_mul_f32 v54, v140, v140 \n"
" v_mul_f32 v55, v141, v141 \n" " v_mul_f32 v55, v141, v141 \n"
" v_mul_f32 v56, v142, v142 \n" " v_mul_f32 v56, v142, v142 \n"
...@@ -157,7 +151,7 @@ ...@@ -157,7 +151,7 @@
" v_exp_f32 v55, v55 \n" " v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n" " v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n" " v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072\n" " buffer_load_dwordx4 acc[28:31], %[v_os_b1], %[s_res_d], 0 offen offset:3072\n"
" v_add_f32 v54, v54, 1.0 \n" " v_add_f32 v54, v54, 1.0 \n"
" v_add_f32 v55, v55, 1.0 \n" " v_add_f32 v55, v55, 1.0 \n"
" v_add_f32 v56, v56, 1.0 \n" " v_add_f32 v56, v56, 1.0 \n"
...@@ -171,7 +165,7 @@ ...@@ -171,7 +165,7 @@
" v_mul_f32 v142, v142, v56 \n" " v_mul_f32 v142, v142, v56 \n"
" v_mul_f32 v143, v143, v57 \n" " v_mul_f32 v143, v143, v57 \n"
" s_waitcnt vmcnt(24) \n" " s_waitcnt vmcnt(24) \n"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen\n" " buffer_load_dwordx4 acc[32:35], %[v_os_b2], %[s_res_d], 0 offen\n"
" v_mul_f32 v54, v144, v144 \n" " v_mul_f32 v54, v144, v144 \n"
" v_mul_f32 v55, v145, v145 \n" " v_mul_f32 v55, v145, v145 \n"
" v_mul_f32 v56, v146, v146 \n" " v_mul_f32 v56, v146, v146 \n"
...@@ -192,7 +186,7 @@ ...@@ -192,7 +186,7 @@
" v_exp_f32 v55, v55 \n" " v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n" " v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n" " v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024\n" " buffer_load_dwordx4 acc[36:39], %[v_os_b2], %[s_res_d], 0 offen offset:1024\n"
" v_add_f32 v54, v54, 1.0 \n" " v_add_f32 v54, v54, 1.0 \n"
" v_add_f32 v55, v55, 1.0 \n" " v_add_f32 v55, v55, 1.0 \n"
" v_add_f32 v56, v56, 1.0 \n" " v_add_f32 v56, v56, 1.0 \n"
...@@ -205,7 +199,7 @@ ...@@ -205,7 +199,7 @@
" v_mul_f32 v145, v145, v55 \n" " v_mul_f32 v145, v145, v55 \n"
" v_mul_f32 v146, v146, v56 \n" " v_mul_f32 v146, v146, v56 \n"
" v_mul_f32 v147, v147, v57 \n" " v_mul_f32 v147, v147, v57 \n"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048\n" " buffer_load_dwordx4 acc[40:43], %[v_os_b2], %[s_res_d], 0 offen offset:2048\n"
" v_mul_f32 v54, v148, v148 \n" " v_mul_f32 v54, v148, v148 \n"
" v_mul_f32 v55, v149, v149 \n" " v_mul_f32 v55, v149, v149 \n"
" v_mul_f32 v56, v150, v150 \n" " v_mul_f32 v56, v150, v150 \n"
...@@ -226,7 +220,7 @@ ...@@ -226,7 +220,7 @@
" v_exp_f32 v55, v55 \n" " v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n" " v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n" " v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072\n" " buffer_load_dwordx4 acc[44:47], %[v_os_b2], %[s_res_d], 0 offen offset:3072\n"
" v_add_f32 v54, v54, 1.0 \n" " v_add_f32 v54, v54, 1.0 \n"
" v_add_f32 v55, v55, 1.0 \n" " v_add_f32 v55, v55, 1.0 \n"
" v_add_f32 v56, v56, 1.0 \n" " v_add_f32 v56, v56, 1.0 \n"
...@@ -239,7 +233,7 @@ ...@@ -239,7 +233,7 @@
" v_mul_f32 v149, v149, v55 \n" " v_mul_f32 v149, v149, v55 \n"
" v_mul_f32 v150, v150, v56 \n" " v_mul_f32 v150, v150, v56 \n"
" v_mul_f32 v151, v151, v57 \n" " v_mul_f32 v151, v151, v57 \n"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen\n" " buffer_load_dwordx4 acc[48:51], %[v_os_b3], %[s_res_d], 0 offen\n"
" v_mul_f32 v54, v152, v152 \n" " v_mul_f32 v54, v152, v152 \n"
" v_mul_f32 v55, v153, v153 \n" " v_mul_f32 v55, v153, v153 \n"
" v_mul_f32 v56, v154, v154 \n" " v_mul_f32 v56, v154, v154 \n"
...@@ -260,7 +254,7 @@ ...@@ -260,7 +254,7 @@
" v_exp_f32 v55, v55 \n" " v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n" " v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n" " v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024\n" " buffer_load_dwordx4 acc[52:55], %[v_os_b3], %[s_res_d], 0 offen offset:1024\n"
" v_add_f32 v54, v54, 1.0 \n" " v_add_f32 v54, v54, 1.0 \n"
" v_add_f32 v55, v55, 1.0 \n" " v_add_f32 v55, v55, 1.0 \n"
" v_add_f32 v56, v56, 1.0 \n" " v_add_f32 v56, v56, 1.0 \n"
...@@ -273,7 +267,7 @@ ...@@ -273,7 +267,7 @@
" v_mul_f32 v153, v153, v55 \n" " v_mul_f32 v153, v153, v55 \n"
" v_mul_f32 v154, v154, v56 \n" " v_mul_f32 v154, v154, v56 \n"
" v_mul_f32 v155, v155, v57 \n" " v_mul_f32 v155, v155, v57 \n"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048\n" " buffer_load_dwordx4 acc[56:59], %[v_os_b3], %[s_res_d], 0 offen offset:2048\n"
" v_mul_f32 v54, v156, v156 \n" " v_mul_f32 v54, v156, v156 \n"
" v_mul_f32 v55, v157, v157 \n" " v_mul_f32 v55, v157, v157 \n"
" v_mul_f32 v56, v158, v158 \n" " v_mul_f32 v56, v158, v158 \n"
...@@ -294,7 +288,7 @@ ...@@ -294,7 +288,7 @@
" v_exp_f32 v55, v55 \n" " v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n" " v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n" " v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072\n" " buffer_load_dwordx4 acc[60:63], %[v_os_b3], %[s_res_d], 0 offen offset:3072\n"
" s_add_u32 s12, %[s_tile_os_b_half], s12 \n" " s_add_u32 s12, %[s_tile_os_b_half], s12 \n"
" s_addc_u32 s13, 0, s13 \n" " s_addc_u32 s13, 0, s13 \n"
" v_add_f32 v54, v54, 1.0 \n" " v_add_f32 v54, v54, 1.0 \n"
...@@ -310,7 +304,7 @@ ...@@ -310,7 +304,7 @@
" v_mul_f32 v158, v158, v56 \n" " v_mul_f32 v158, v158, v56 \n"
" v_mul_f32 v159, v159, v57 \n" " v_mul_f32 v159, v159, v57 \n"
" s_waitcnt vmcnt(24) \n" " s_waitcnt vmcnt(24) \n"
" buffer_load_dwordx4 acc[64:67], %[v_os_b0], s[12:15], 0 offen\n" " buffer_load_dwordx4 acc[64:67], %[v_os_b0], %[s_res_d], 0 offen\n"
" v_mul_f32 v54, v160, v160 \n" " v_mul_f32 v54, v160, v160 \n"
" v_mul_f32 v55, v161, v161 \n" " v_mul_f32 v55, v161, v161 \n"
" v_mul_f32 v56, v162, v162 \n" " v_mul_f32 v56, v162, v162 \n"
...@@ -331,7 +325,7 @@ ...@@ -331,7 +325,7 @@
" v_exp_f32 v55, v55 \n" " v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n" " v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n" " v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[68:71], %[v_os_b0], s[12:15], 0 offen offset:1024\n" " buffer_load_dwordx4 acc[68:71], %[v_os_b0], %[s_res_d], 0 offen offset:1024\n"
" v_add_f32 v54, v54, 1.0 \n" " v_add_f32 v54, v54, 1.0 \n"
" v_add_f32 v55, v55, 1.0 \n" " v_add_f32 v55, v55, 1.0 \n"
" v_add_f32 v56, v56, 1.0 \n" " v_add_f32 v56, v56, 1.0 \n"
...@@ -344,7 +338,7 @@ ...@@ -344,7 +338,7 @@
" v_mul_f32 v161, v161, v55 \n" " v_mul_f32 v161, v161, v55 \n"
" v_mul_f32 v162, v162, v56 \n" " v_mul_f32 v162, v162, v56 \n"
" v_mul_f32 v163, v163, v57 \n" " v_mul_f32 v163, v163, v57 \n"
" buffer_load_dwordx4 acc[72:75], %[v_os_b0], s[12:15], 0 offen offset:2048\n" " buffer_load_dwordx4 acc[72:75], %[v_os_b0], %[s_res_d], 0 offen offset:2048\n"
" v_mul_f32 v54, v164, v164 \n" " v_mul_f32 v54, v164, v164 \n"
" v_mul_f32 v55, v165, v165 \n" " v_mul_f32 v55, v165, v165 \n"
" v_mul_f32 v56, v166, v166 \n" " v_mul_f32 v56, v166, v166 \n"
...@@ -365,7 +359,7 @@ ...@@ -365,7 +359,7 @@
" v_exp_f32 v55, v55 \n" " v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n" " v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n" " v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[76:79], %[v_os_b0], s[12:15], 0 offen offset:3072\n" " buffer_load_dwordx4 acc[76:79], %[v_os_b0], %[s_res_d], 0 offen offset:3072\n"
" v_add_f32 v54, v54, 1.0 \n" " v_add_f32 v54, v54, 1.0 \n"
" v_add_f32 v55, v55, 1.0 \n" " v_add_f32 v55, v55, 1.0 \n"
" v_add_f32 v56, v56, 1.0 \n" " v_add_f32 v56, v56, 1.0 \n"
...@@ -378,7 +372,7 @@ ...@@ -378,7 +372,7 @@
" v_mul_f32 v165, v165, v55 \n" " v_mul_f32 v165, v165, v55 \n"
" v_mul_f32 v166, v166, v56 \n" " v_mul_f32 v166, v166, v56 \n"
" v_mul_f32 v167, v167, v57 \n" " v_mul_f32 v167, v167, v57 \n"
" buffer_load_dwordx4 acc[80:83], %[v_os_b1], s[12:15], 0 offen\n" " buffer_load_dwordx4 acc[80:83], %[v_os_b1], %[s_res_d], 0 offen\n"
" v_mul_f32 v54, v168, v168 \n" " v_mul_f32 v54, v168, v168 \n"
" v_mul_f32 v55, v169, v169 \n" " v_mul_f32 v55, v169, v169 \n"
" v_mul_f32 v56, v170, v170 \n" " v_mul_f32 v56, v170, v170 \n"
...@@ -399,7 +393,7 @@ ...@@ -399,7 +393,7 @@
" v_exp_f32 v55, v55 \n" " v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n" " v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n" " v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[84:87], %[v_os_b1], s[12:15], 0 offen offset:1024\n" " buffer_load_dwordx4 acc[84:87], %[v_os_b1], %[s_res_d], 0 offen offset:1024\n"
" v_add_f32 v54, v54, 1.0 \n" " v_add_f32 v54, v54, 1.0 \n"
" v_add_f32 v55, v55, 1.0 \n" " v_add_f32 v55, v55, 1.0 \n"
" v_add_f32 v56, v56, 1.0 \n" " v_add_f32 v56, v56, 1.0 \n"
...@@ -412,7 +406,7 @@ ...@@ -412,7 +406,7 @@
" v_mul_f32 v169, v169, v55 \n" " v_mul_f32 v169, v169, v55 \n"
" v_mul_f32 v170, v170, v56 \n" " v_mul_f32 v170, v170, v56 \n"
" v_mul_f32 v171, v171, v57 \n" " v_mul_f32 v171, v171, v57 \n"
" buffer_load_dwordx4 acc[88:91], %[v_os_b1], s[12:15], 0 offen offset:2048\n" " buffer_load_dwordx4 acc[88:91], %[v_os_b1], %[s_res_d], 0 offen offset:2048\n"
" v_mul_f32 v54, v172, v172 \n" " v_mul_f32 v54, v172, v172 \n"
" v_mul_f32 v55, v173, v173 \n" " v_mul_f32 v55, v173, v173 \n"
" v_mul_f32 v56, v174, v174 \n" " v_mul_f32 v56, v174, v174 \n"
...@@ -433,7 +427,7 @@ ...@@ -433,7 +427,7 @@
" v_exp_f32 v55, v55 \n" " v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n" " v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n" " v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[92:95], %[v_os_b1], s[12:15], 0 offen offset:3072\n" " buffer_load_dwordx4 acc[92:95], %[v_os_b1], %[s_res_d], 0 offen offset:3072\n"
" v_add_f32 v54, v54, 1.0 \n" " v_add_f32 v54, v54, 1.0 \n"
" v_add_f32 v55, v55, 1.0 \n" " v_add_f32 v55, v55, 1.0 \n"
" v_add_f32 v56, v56, 1.0 \n" " v_add_f32 v56, v56, 1.0 \n"
...@@ -447,7 +441,7 @@ ...@@ -447,7 +441,7 @@
" v_mul_f32 v174, v174, v56 \n" " v_mul_f32 v174, v174, v56 \n"
" v_mul_f32 v175, v175, v57 \n" " v_mul_f32 v175, v175, v57 \n"
" s_waitcnt vmcnt(24) \n" " s_waitcnt vmcnt(24) \n"
" buffer_load_dwordx4 acc[96:99], %[v_os_b2], s[12:15], 0 offen\n" " buffer_load_dwordx4 acc[96:99], %[v_os_b2], %[s_res_d], 0 offen\n"
" v_mul_f32 v54, v176, v176 \n" " v_mul_f32 v54, v176, v176 \n"
" v_mul_f32 v55, v177, v177 \n" " v_mul_f32 v55, v177, v177 \n"
" v_mul_f32 v56, v178, v178 \n" " v_mul_f32 v56, v178, v178 \n"
...@@ -468,7 +462,7 @@ ...@@ -468,7 +462,7 @@
" v_exp_f32 v55, v55 \n" " v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n" " v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n" " v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[100:103], %[v_os_b2], s[12:15], 0 offen offset:1024\n" " buffer_load_dwordx4 acc[100:103], %[v_os_b2], %[s_res_d], 0 offen offset:1024\n"
" v_add_f32 v54, v54, 1.0 \n" " v_add_f32 v54, v54, 1.0 \n"
" v_add_f32 v55, v55, 1.0 \n" " v_add_f32 v55, v55, 1.0 \n"
" v_add_f32 v56, v56, 1.0 \n" " v_add_f32 v56, v56, 1.0 \n"
...@@ -481,7 +475,7 @@ ...@@ -481,7 +475,7 @@
" v_mul_f32 v177, v177, v55 \n" " v_mul_f32 v177, v177, v55 \n"
" v_mul_f32 v178, v178, v56 \n" " v_mul_f32 v178, v178, v56 \n"
" v_mul_f32 v179, v179, v57 \n" " v_mul_f32 v179, v179, v57 \n"
" buffer_load_dwordx4 acc[104:107], %[v_os_b2], s[12:15], 0 offen offset:2048\n" " buffer_load_dwordx4 acc[104:107], %[v_os_b2], %[s_res_d], 0 offen offset:2048\n"
" v_mul_f32 v54, v180, v180 \n" " v_mul_f32 v54, v180, v180 \n"
" v_mul_f32 v55, v181, v181 \n" " v_mul_f32 v55, v181, v181 \n"
" v_mul_f32 v56, v182, v182 \n" " v_mul_f32 v56, v182, v182 \n"
...@@ -502,7 +496,7 @@ ...@@ -502,7 +496,7 @@
" v_exp_f32 v55, v55 \n" " v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n" " v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n" " v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[108:111], %[v_os_b2], s[12:15], 0 offen offset:3072\n" " buffer_load_dwordx4 acc[108:111], %[v_os_b2], %[s_res_d], 0 offen offset:3072\n"
" v_add_f32 v54, v54, 1.0 \n" " v_add_f32 v54, v54, 1.0 \n"
" v_add_f32 v55, v55, 1.0 \n" " v_add_f32 v55, v55, 1.0 \n"
" v_add_f32 v56, v56, 1.0 \n" " v_add_f32 v56, v56, 1.0 \n"
...@@ -515,7 +509,7 @@ ...@@ -515,7 +509,7 @@
" v_mul_f32 v181, v181, v55 \n" " v_mul_f32 v181, v181, v55 \n"
" v_mul_f32 v182, v182, v56 \n" " v_mul_f32 v182, v182, v56 \n"
" v_mul_f32 v183, v183, v57 \n" " v_mul_f32 v183, v183, v57 \n"
" buffer_load_dwordx4 acc[112:115], %[v_os_b3], s[12:15], 0 offen\n" " buffer_load_dwordx4 acc[112:115], %[v_os_b3], %[s_res_d], 0 offen\n"
" v_mul_f32 v54, v184, v184 \n" " v_mul_f32 v54, v184, v184 \n"
" v_mul_f32 v55, v185, v185 \n" " v_mul_f32 v55, v185, v185 \n"
" v_mul_f32 v56, v186, v186 \n" " v_mul_f32 v56, v186, v186 \n"
...@@ -536,7 +530,7 @@ ...@@ -536,7 +530,7 @@
" v_exp_f32 v55, v55 \n" " v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n" " v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n" " v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[116:119], %[v_os_b3], s[12:15], 0 offen offset:1024\n" " buffer_load_dwordx4 acc[116:119], %[v_os_b3], %[s_res_d], 0 offen offset:1024\n"
" v_add_f32 v54, v54, 1.0 \n" " v_add_f32 v54, v54, 1.0 \n"
" v_add_f32 v55, v55, 1.0 \n" " v_add_f32 v55, v55, 1.0 \n"
" v_add_f32 v56, v56, 1.0 \n" " v_add_f32 v56, v56, 1.0 \n"
...@@ -549,7 +543,7 @@ ...@@ -549,7 +543,7 @@
" v_mul_f32 v185, v185, v55 \n" " v_mul_f32 v185, v185, v55 \n"
" v_mul_f32 v186, v186, v56 \n" " v_mul_f32 v186, v186, v56 \n"
" v_mul_f32 v187, v187, v57 \n" " v_mul_f32 v187, v187, v57 \n"
" buffer_load_dwordx4 acc[120:123], %[v_os_b3], s[12:15], 0 offen offset:2048\n" " buffer_load_dwordx4 acc[120:123], %[v_os_b3], %[s_res_d], 0 offen offset:2048\n"
" v_mul_f32 v54, v188, v188 \n" " v_mul_f32 v54, v188, v188 \n"
" v_mul_f32 v55, v189, v189 \n" " v_mul_f32 v55, v189, v189 \n"
" v_mul_f32 v56, v190, v190 \n" " v_mul_f32 v56, v190, v190 \n"
...@@ -570,7 +564,7 @@ ...@@ -570,7 +564,7 @@
" v_exp_f32 v55, v55 \n" " v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n" " v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n" " v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[124:127], %[v_os_b3], s[12:15], 0 offen offset:3072\n" " buffer_load_dwordx4 acc[124:127], %[v_os_b3], %[s_res_d], 0 offen offset:3072\n"
" v_add_f32 v54, v54, 1.0 \n" " v_add_f32 v54, v54, 1.0 \n"
" v_add_f32 v55, v55, 1.0 \n" " v_add_f32 v55, v55, 1.0 \n"
" v_add_f32 v56, v56, 1.0 \n" " v_add_f32 v56, v56, 1.0 \n"
...@@ -647,7 +641,7 @@ ...@@ -647,7 +641,7 @@
" v_mul_f32 v189, v19, v189 row_newbcast:13 \n" " v_mul_f32 v189, v19, v189 row_newbcast:13 \n"
" v_mul_f32 v190, v19, v190 row_newbcast:14 \n" " v_mul_f32 v190, v19, v190 row_newbcast:14 \n"
" v_mul_f32 v191, v19, v191 row_newbcast:15 \n" " v_mul_f32 v191, v19, v191 row_newbcast:15 \n"
" buffer_load_dword v12, v5, s[16:19], 0 offen \n" " buffer_load_dword v12, v5, %[s_res_dq], 0 offen \n"
" v_mov_b32 v22, 0x358637bd \n" " v_mov_b32 v22, 0x358637bd \n"
" v_mov_b32 v23, 0x358637bd \n" " v_mov_b32 v23, 0x358637bd \n"
" v_max3_f32 v22, abs(v128), abs(v129), v22 \n" " v_max3_f32 v22, abs(v128), abs(v129), v22 \n"
...@@ -945,3 +939,4 @@ ...@@ -945,3 +939,4 @@
#undef _UK_PK_CVT_ #undef _UK_PK_CVT_
#undef _UK_ATOMIC_ADD_ #undef _UK_ATOMIC_ADD_
...@@ -65,88 +65,88 @@ ...@@ -65,88 +65,88 @@
" s_addc_u32 s17, 0, s17 \n" " s_addc_u32 s17, 0, s17 \n"
" s_mov_b32 s80, 0 \n" " s_mov_b32 s80, 0 \n"
" s_waitcnt 0x0000 \n" " s_waitcnt 0x0000 \n"
"label_0C3C: \n" "label_startgemm2: \n"
" s_waitcnt vmcnt(41) \n" " s_waitcnt vmcnt(41) \n"
" s_barrier \n" " s_barrier \n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[0:1], v[128:129], 0\n" " v_mfma_i32_16x16x32_i8 v[192:195], acc[0:1], v[128:129], 0\n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[2:3], v[130:131], v[192:195]\n" " v_mfma_i32_16x16x32_i8 v[192:195], acc[2:3], v[130:131], v[192:195]\n"
" buffer_load_dwordx4 acc[128:131], %[v_os_b0], s[12:15], 0 offen\n" " buffer_load_dwordx4 acc[128:131], %[v_os_b0], %[s_res_d], 0 offen\n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[4:5], v[132:133], v[192:195]\n" " v_mfma_i32_16x16x32_i8 v[192:195], acc[4:5], v[132:133], v[192:195]\n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[6:7], v[134:135], v[192:195]\n" " v_mfma_i32_16x16x32_i8 v[192:195], acc[6:7], v[134:135], v[192:195]\n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[8:9], v[136:137], v[192:195]\n" " v_mfma_i32_16x16x32_i8 v[192:195], acc[8:9], v[136:137], v[192:195]\n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[10:11], v[138:139], v[192:195]\n" " v_mfma_i32_16x16x32_i8 v[192:195], acc[10:11], v[138:139], v[192:195]\n"
" buffer_load_dwordx4 acc[132:135], %[v_os_b0], s[12:15], 0 offen offset:1024\n" " buffer_load_dwordx4 acc[132:135], %[v_os_b0], %[s_res_d], 0 offen offset:1024\n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[12:13], v[140:141], v[192:195]\n" " v_mfma_i32_16x16x32_i8 v[192:195], acc[12:13], v[140:141], v[192:195]\n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[14:15], v[142:143], v[192:195]\n" " v_mfma_i32_16x16x32_i8 v[192:195], acc[14:15], v[142:143], v[192:195]\n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[0:1], v[160:161], 0\n" " v_mfma_i32_16x16x32_i8 v[196:199], acc[0:1], v[160:161], 0\n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[2:3], v[162:163], v[196:199]\n" " v_mfma_i32_16x16x32_i8 v[196:199], acc[2:3], v[162:163], v[196:199]\n"
" buffer_load_dwordx4 acc[136:139], %[v_os_b0], s[12:15], 0 offen offset:2048\n" " buffer_load_dwordx4 acc[136:139], %[v_os_b0], %[s_res_d], 0 offen offset:2048\n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[4:5], v[164:165], v[196:199]\n" " v_mfma_i32_16x16x32_i8 v[196:199], acc[4:5], v[164:165], v[196:199]\n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[6:7], v[166:167], v[196:199]\n" " v_mfma_i32_16x16x32_i8 v[196:199], acc[6:7], v[166:167], v[196:199]\n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[8:9], v[168:169], v[196:199]\n" " v_mfma_i32_16x16x32_i8 v[196:199], acc[8:9], v[168:169], v[196:199]\n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[10:11], v[170:171], v[196:199]\n" " v_mfma_i32_16x16x32_i8 v[196:199], acc[10:11], v[170:171], v[196:199]\n"
" buffer_load_dwordx4 acc[140:143], %[v_os_b0], s[12:15], 0 offen offset:3072\n" " buffer_load_dwordx4 acc[140:143], %[v_os_b0], %[s_res_d], 0 offen offset:3072\n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[12:13], v[172:173], v[196:199]\n" " v_mfma_i32_16x16x32_i8 v[196:199], acc[12:13], v[172:173], v[196:199]\n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[14:15], v[174:175], v[196:199]\n" " v_mfma_i32_16x16x32_i8 v[196:199], acc[14:15], v[174:175], v[196:199]\n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[16:17], v[128:129], 0\n" " v_mfma_i32_16x16x32_i8 v[200:203], acc[16:17], v[128:129], 0\n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[18:19], v[130:131], v[200:203]\n" " v_mfma_i32_16x16x32_i8 v[200:203], acc[18:19], v[130:131], v[200:203]\n"
" buffer_load_dwordx4 acc[144:147], %[v_os_b1], s[12:15], 0 offen\n" " buffer_load_dwordx4 acc[144:147], %[v_os_b1], %[s_res_d], 0 offen\n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[20:21], v[132:133], v[200:203]\n" " v_mfma_i32_16x16x32_i8 v[200:203], acc[20:21], v[132:133], v[200:203]\n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[22:23], v[134:135], v[200:203]\n" " v_mfma_i32_16x16x32_i8 v[200:203], acc[22:23], v[134:135], v[200:203]\n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[24:25], v[136:137], v[200:203]\n" " v_mfma_i32_16x16x32_i8 v[200:203], acc[24:25], v[136:137], v[200:203]\n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[26:27], v[138:139], v[200:203]\n" " v_mfma_i32_16x16x32_i8 v[200:203], acc[26:27], v[138:139], v[200:203]\n"
" buffer_load_dwordx4 acc[148:151], %[v_os_b1], s[12:15], 0 offen offset:1024\n" " buffer_load_dwordx4 acc[148:151], %[v_os_b1], %[s_res_d], 0 offen offset:1024\n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[28:29], v[140:141], v[200:203]\n" " v_mfma_i32_16x16x32_i8 v[200:203], acc[28:29], v[140:141], v[200:203]\n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[30:31], v[142:143], v[200:203]\n" " v_mfma_i32_16x16x32_i8 v[200:203], acc[30:31], v[142:143], v[200:203]\n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[16:17], v[160:161], 0\n" " v_mfma_i32_16x16x32_i8 v[204:207], acc[16:17], v[160:161], 0\n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[18:19], v[162:163], v[204:207]\n" " v_mfma_i32_16x16x32_i8 v[204:207], acc[18:19], v[162:163], v[204:207]\n"
" buffer_load_dwordx4 acc[152:155], %[v_os_b1], s[12:15], 0 offen offset:2048\n" " buffer_load_dwordx4 acc[152:155], %[v_os_b1], %[s_res_d], 0 offen offset:2048\n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[20:21], v[164:165], v[204:207]\n" " v_mfma_i32_16x16x32_i8 v[204:207], acc[20:21], v[164:165], v[204:207]\n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[22:23], v[166:167], v[204:207]\n" " v_mfma_i32_16x16x32_i8 v[204:207], acc[22:23], v[166:167], v[204:207]\n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[24:25], v[168:169], v[204:207]\n" " v_mfma_i32_16x16x32_i8 v[204:207], acc[24:25], v[168:169], v[204:207]\n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[26:27], v[170:171], v[204:207]\n" " v_mfma_i32_16x16x32_i8 v[204:207], acc[26:27], v[170:171], v[204:207]\n"
" buffer_load_dwordx4 acc[156:159], %[v_os_b1], s[12:15], 0 offen offset:3072\n" " buffer_load_dwordx4 acc[156:159], %[v_os_b1], %[s_res_d], 0 offen offset:3072\n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[28:29], v[172:173], v[204:207]\n" " v_mfma_i32_16x16x32_i8 v[204:207], acc[28:29], v[172:173], v[204:207]\n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[30:31], v[174:175], v[204:207]\n" " v_mfma_i32_16x16x32_i8 v[204:207], acc[30:31], v[174:175], v[204:207]\n"
" s_waitcnt vmcnt(41) \n" " s_waitcnt vmcnt(41) \n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[32:33], v[128:129], 0\n" " v_mfma_i32_16x16x32_i8 v[208:211], acc[32:33], v[128:129], 0\n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[34:35], v[130:131], v[208:211]\n" " v_mfma_i32_16x16x32_i8 v[208:211], acc[34:35], v[130:131], v[208:211]\n"
" buffer_load_dwordx4 acc[160:163], %[v_os_b2], s[12:15], 0 offen\n" " buffer_load_dwordx4 acc[160:163], %[v_os_b2], %[s_res_d], 0 offen\n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[36:37], v[132:133], v[208:211]\n" " v_mfma_i32_16x16x32_i8 v[208:211], acc[36:37], v[132:133], v[208:211]\n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[38:39], v[134:135], v[208:211]\n" " v_mfma_i32_16x16x32_i8 v[208:211], acc[38:39], v[134:135], v[208:211]\n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[40:41], v[136:137], v[208:211]\n" " v_mfma_i32_16x16x32_i8 v[208:211], acc[40:41], v[136:137], v[208:211]\n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[42:43], v[138:139], v[208:211]\n" " v_mfma_i32_16x16x32_i8 v[208:211], acc[42:43], v[138:139], v[208:211]\n"
" buffer_load_dwordx4 acc[164:167], %[v_os_b2], s[12:15], 0 offen offset:1024\n" " buffer_load_dwordx4 acc[164:167], %[v_os_b2], %[s_res_d], 0 offen offset:1024\n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[44:45], v[140:141], v[208:211]\n" " v_mfma_i32_16x16x32_i8 v[208:211], acc[44:45], v[140:141], v[208:211]\n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[46:47], v[142:143], v[208:211]\n" " v_mfma_i32_16x16x32_i8 v[208:211], acc[46:47], v[142:143], v[208:211]\n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[32:33], v[160:161], 0\n" " v_mfma_i32_16x16x32_i8 v[212:215], acc[32:33], v[160:161], 0\n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[34:35], v[162:163], v[212:215]\n" " v_mfma_i32_16x16x32_i8 v[212:215], acc[34:35], v[162:163], v[212:215]\n"
" buffer_load_dwordx4 acc[168:171], %[v_os_b2], s[12:15], 0 offen offset:2048\n" " buffer_load_dwordx4 acc[168:171], %[v_os_b2], %[s_res_d], 0 offen offset:2048\n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[36:37], v[164:165], v[212:215]\n" " v_mfma_i32_16x16x32_i8 v[212:215], acc[36:37], v[164:165], v[212:215]\n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[38:39], v[166:167], v[212:215]\n" " v_mfma_i32_16x16x32_i8 v[212:215], acc[38:39], v[166:167], v[212:215]\n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[40:41], v[168:169], v[212:215]\n" " v_mfma_i32_16x16x32_i8 v[212:215], acc[40:41], v[168:169], v[212:215]\n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[42:43], v[170:171], v[212:215]\n" " v_mfma_i32_16x16x32_i8 v[212:215], acc[42:43], v[170:171], v[212:215]\n"
" buffer_load_dwordx4 acc[172:175], %[v_os_b2], s[12:15], 0 offen offset:3072\n" " buffer_load_dwordx4 acc[172:175], %[v_os_b2], %[s_res_d], 0 offen offset:3072\n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[44:45], v[172:173], v[212:215]\n" " v_mfma_i32_16x16x32_i8 v[212:215], acc[44:45], v[172:173], v[212:215]\n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[46:47], v[174:175], v[212:215]\n" " v_mfma_i32_16x16x32_i8 v[212:215], acc[46:47], v[174:175], v[212:215]\n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[48:49], v[128:129], 0\n" " v_mfma_i32_16x16x32_i8 v[216:219], acc[48:49], v[128:129], 0\n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[50:51], v[130:131], v[216:219]\n" " v_mfma_i32_16x16x32_i8 v[216:219], acc[50:51], v[130:131], v[216:219]\n"
" buffer_load_dwordx4 acc[176:179], %[v_os_b3], s[12:15], 0 offen\n" " buffer_load_dwordx4 acc[176:179], %[v_os_b3], %[s_res_d], 0 offen\n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[52:53], v[132:133], v[216:219]\n" " v_mfma_i32_16x16x32_i8 v[216:219], acc[52:53], v[132:133], v[216:219]\n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[54:55], v[134:135], v[216:219]\n" " v_mfma_i32_16x16x32_i8 v[216:219], acc[54:55], v[134:135], v[216:219]\n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[56:57], v[136:137], v[216:219]\n" " v_mfma_i32_16x16x32_i8 v[216:219], acc[56:57], v[136:137], v[216:219]\n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[58:59], v[138:139], v[216:219]\n" " v_mfma_i32_16x16x32_i8 v[216:219], acc[58:59], v[138:139], v[216:219]\n"
" buffer_load_dwordx4 acc[180:183], %[v_os_b3], s[12:15], 0 offen offset:1024\n" " buffer_load_dwordx4 acc[180:183], %[v_os_b3], %[s_res_d], 0 offen offset:1024\n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[60:61], v[140:141], v[216:219]\n" " v_mfma_i32_16x16x32_i8 v[216:219], acc[60:61], v[140:141], v[216:219]\n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[62:63], v[142:143], v[216:219]\n" " v_mfma_i32_16x16x32_i8 v[216:219], acc[62:63], v[142:143], v[216:219]\n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[48:49], v[160:161], 0\n" " v_mfma_i32_16x16x32_i8 v[220:223], acc[48:49], v[160:161], 0\n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[50:51], v[162:163], v[220:223]\n" " v_mfma_i32_16x16x32_i8 v[220:223], acc[50:51], v[162:163], v[220:223]\n"
" buffer_load_dwordx4 acc[184:187], %[v_os_b3], s[12:15], 0 offen offset:2048\n" " buffer_load_dwordx4 acc[184:187], %[v_os_b3], %[s_res_d], 0 offen offset:2048\n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[52:53], v[164:165], v[220:223]\n" " v_mfma_i32_16x16x32_i8 v[220:223], acc[52:53], v[164:165], v[220:223]\n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[54:55], v[166:167], v[220:223]\n" " v_mfma_i32_16x16x32_i8 v[220:223], acc[54:55], v[166:167], v[220:223]\n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[56:57], v[168:169], v[220:223]\n" " v_mfma_i32_16x16x32_i8 v[220:223], acc[56:57], v[168:169], v[220:223]\n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[58:59], v[170:171], v[220:223]\n" " v_mfma_i32_16x16x32_i8 v[220:223], acc[58:59], v[170:171], v[220:223]\n"
" buffer_load_dwordx4 acc[188:191], %[v_os_b3], s[12:15], 0 offen offset:3072\n" " buffer_load_dwordx4 acc[188:191], %[v_os_b3], %[s_res_d], 0 offen offset:3072\n"
" s_add_u32 s12, %[s_tile_os_b_half], s12 \n" " s_add_u32 s12, %[s_tile_os_b_half], s12 \n"
" s_addc_u32 s13, 0, s13 \n" " s_addc_u32 s13, 0, s13 \n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[60:61], v[172:173], v[220:223]\n" " v_mfma_i32_16x16x32_i8 v[220:223], acc[60:61], v[172:173], v[220:223]\n"
...@@ -154,84 +154,84 @@ ...@@ -154,84 +154,84 @@
" s_waitcnt vmcnt(41) \n" " s_waitcnt vmcnt(41) \n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[64:65], v[144:145], v[192:195]\n" " v_mfma_i32_16x16x32_i8 v[192:195], acc[64:65], v[144:145], v[192:195]\n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[66:67], v[146:147], v[192:195]\n" " v_mfma_i32_16x16x32_i8 v[192:195], acc[66:67], v[146:147], v[192:195]\n"
" buffer_load_dwordx4 acc[192:195], %[v_os_b0], s[12:15], 0 offen\n" " buffer_load_dwordx4 acc[192:195], %[v_os_b0], %[s_res_d], 0 offen\n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[68:69], v[148:149], v[192:195]\n" " v_mfma_i32_16x16x32_i8 v[192:195], acc[68:69], v[148:149], v[192:195]\n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[70:71], v[150:151], v[192:195]\n" " v_mfma_i32_16x16x32_i8 v[192:195], acc[70:71], v[150:151], v[192:195]\n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[72:73], v[152:153], v[192:195]\n" " v_mfma_i32_16x16x32_i8 v[192:195], acc[72:73], v[152:153], v[192:195]\n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[74:75], v[154:155], v[192:195]\n" " v_mfma_i32_16x16x32_i8 v[192:195], acc[74:75], v[154:155], v[192:195]\n"
" buffer_load_dwordx4 acc[196:199], %[v_os_b0], s[12:15], 0 offen offset:1024\n" " buffer_load_dwordx4 acc[196:199], %[v_os_b0], %[s_res_d], 0 offen offset:1024\n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[76:77], v[156:157], v[192:195]\n" " v_mfma_i32_16x16x32_i8 v[192:195], acc[76:77], v[156:157], v[192:195]\n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[78:79], v[158:159], v[192:195]\n" " v_mfma_i32_16x16x32_i8 v[192:195], acc[78:79], v[158:159], v[192:195]\n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[64:65], v[176:177], v[196:199]\n" " v_mfma_i32_16x16x32_i8 v[196:199], acc[64:65], v[176:177], v[196:199]\n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[66:67], v[178:179], v[196:199]\n" " v_mfma_i32_16x16x32_i8 v[196:199], acc[66:67], v[178:179], v[196:199]\n"
" buffer_load_dwordx4 acc[200:203], %[v_os_b0], s[12:15], 0 offen offset:2048\n" " buffer_load_dwordx4 acc[200:203], %[v_os_b0], %[s_res_d], 0 offen offset:2048\n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[68:69], v[180:181], v[196:199]\n" " v_mfma_i32_16x16x32_i8 v[196:199], acc[68:69], v[180:181], v[196:199]\n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[70:71], v[182:183], v[196:199]\n" " v_mfma_i32_16x16x32_i8 v[196:199], acc[70:71], v[182:183], v[196:199]\n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[72:73], v[184:185], v[196:199]\n" " v_mfma_i32_16x16x32_i8 v[196:199], acc[72:73], v[184:185], v[196:199]\n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[74:75], v[186:187], v[196:199]\n" " v_mfma_i32_16x16x32_i8 v[196:199], acc[74:75], v[186:187], v[196:199]\n"
" buffer_load_dwordx4 acc[204:207], %[v_os_b0], s[12:15], 0 offen offset:3072\n" " buffer_load_dwordx4 acc[204:207], %[v_os_b0], %[s_res_d], 0 offen offset:3072\n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[76:77], v[188:189], v[196:199]\n" " v_mfma_i32_16x16x32_i8 v[196:199], acc[76:77], v[188:189], v[196:199]\n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[78:79], v[190:191], v[196:199]\n" " v_mfma_i32_16x16x32_i8 v[196:199], acc[78:79], v[190:191], v[196:199]\n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[80:81], v[144:145], v[200:203]\n" " v_mfma_i32_16x16x32_i8 v[200:203], acc[80:81], v[144:145], v[200:203]\n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[82:83], v[146:147], v[200:203]\n" " v_mfma_i32_16x16x32_i8 v[200:203], acc[82:83], v[146:147], v[200:203]\n"
" buffer_load_dwordx4 acc[208:211], %[v_os_b1], s[12:15], 0 offen\n" " buffer_load_dwordx4 acc[208:211], %[v_os_b1], %[s_res_d], 0 offen\n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[84:85], v[148:149], v[200:203]\n" " v_mfma_i32_16x16x32_i8 v[200:203], acc[84:85], v[148:149], v[200:203]\n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[86:87], v[150:151], v[200:203]\n" " v_mfma_i32_16x16x32_i8 v[200:203], acc[86:87], v[150:151], v[200:203]\n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[88:89], v[152:153], v[200:203]\n" " v_mfma_i32_16x16x32_i8 v[200:203], acc[88:89], v[152:153], v[200:203]\n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[90:91], v[154:155], v[200:203]\n" " v_mfma_i32_16x16x32_i8 v[200:203], acc[90:91], v[154:155], v[200:203]\n"
" buffer_load_dwordx4 acc[212:215], %[v_os_b1], s[12:15], 0 offen offset:1024\n" " buffer_load_dwordx4 acc[212:215], %[v_os_b1], %[s_res_d], 0 offen offset:1024\n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[92:93], v[156:157], v[200:203]\n" " v_mfma_i32_16x16x32_i8 v[200:203], acc[92:93], v[156:157], v[200:203]\n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[94:95], v[158:159], v[200:203]\n" " v_mfma_i32_16x16x32_i8 v[200:203], acc[94:95], v[158:159], v[200:203]\n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[80:81], v[176:177], v[204:207]\n" " v_mfma_i32_16x16x32_i8 v[204:207], acc[80:81], v[176:177], v[204:207]\n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[82:83], v[178:179], v[204:207]\n" " v_mfma_i32_16x16x32_i8 v[204:207], acc[82:83], v[178:179], v[204:207]\n"
" buffer_load_dwordx4 acc[216:219], %[v_os_b1], s[12:15], 0 offen offset:2048\n" " buffer_load_dwordx4 acc[216:219], %[v_os_b1], %[s_res_d], 0 offen offset:2048\n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[84:85], v[180:181], v[204:207]\n" " v_mfma_i32_16x16x32_i8 v[204:207], acc[84:85], v[180:181], v[204:207]\n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[86:87], v[182:183], v[204:207]\n" " v_mfma_i32_16x16x32_i8 v[204:207], acc[86:87], v[182:183], v[204:207]\n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[88:89], v[184:185], v[204:207]\n" " v_mfma_i32_16x16x32_i8 v[204:207], acc[88:89], v[184:185], v[204:207]\n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[90:91], v[186:187], v[204:207]\n" " v_mfma_i32_16x16x32_i8 v[204:207], acc[90:91], v[186:187], v[204:207]\n"
" buffer_load_dwordx4 acc[220:223], %[v_os_b1], s[12:15], 0 offen offset:3072\n" " buffer_load_dwordx4 acc[220:223], %[v_os_b1], %[s_res_d], 0 offen offset:3072\n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[92:93], v[188:189], v[204:207]\n" " v_mfma_i32_16x16x32_i8 v[204:207], acc[92:93], v[188:189], v[204:207]\n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[94:95], v[190:191], v[204:207]\n" " v_mfma_i32_16x16x32_i8 v[204:207], acc[94:95], v[190:191], v[204:207]\n"
" s_waitcnt vmcnt(40) \n" " s_waitcnt vmcnt(40) \n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[96:97], v[144:145], v[208:211]\n" " v_mfma_i32_16x16x32_i8 v[208:211], acc[96:97], v[144:145], v[208:211]\n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[98:99], v[146:147], v[208:211]\n" " v_mfma_i32_16x16x32_i8 v[208:211], acc[98:99], v[146:147], v[208:211]\n"
" buffer_load_dwordx4 acc[224:227], %[v_os_b2], s[12:15], 0 offen\n" " buffer_load_dwordx4 acc[224:227], %[v_os_b2], %[s_res_d], 0 offen\n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[100:101], v[148:149], v[208:211]\n" " v_mfma_i32_16x16x32_i8 v[208:211], acc[100:101], v[148:149], v[208:211]\n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[102:103], v[150:151], v[208:211]\n" " v_mfma_i32_16x16x32_i8 v[208:211], acc[102:103], v[150:151], v[208:211]\n"
" buffer_load_dword v13, v5, s[16:19], 0 offen \n" " buffer_load_dword v13, v5, %[s_res_dq], 0 offen \n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[104:105], v[152:153], v[208:211]\n" " v_mfma_i32_16x16x32_i8 v[208:211], acc[104:105], v[152:153], v[208:211]\n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[106:107], v[154:155], v[208:211]\n" " v_mfma_i32_16x16x32_i8 v[208:211], acc[106:107], v[154:155], v[208:211]\n"
" buffer_load_dwordx4 acc[228:231], %[v_os_b2], s[12:15], 0 offen offset:1024\n" " buffer_load_dwordx4 acc[228:231], %[v_os_b2], %[s_res_d], 0 offen offset:1024\n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[108:109], v[156:157], v[208:211]\n" " v_mfma_i32_16x16x32_i8 v[208:211], acc[108:109], v[156:157], v[208:211]\n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[110:111], v[158:159], v[208:211]\n" " v_mfma_i32_16x16x32_i8 v[208:211], acc[110:111], v[158:159], v[208:211]\n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[96:97], v[176:177], v[212:215]\n" " v_mfma_i32_16x16x32_i8 v[212:215], acc[96:97], v[176:177], v[212:215]\n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[98:99], v[178:179], v[212:215]\n" " v_mfma_i32_16x16x32_i8 v[212:215], acc[98:99], v[178:179], v[212:215]\n"
" buffer_load_dwordx4 acc[232:235], %[v_os_b2], s[12:15], 0 offen offset:2048\n" " buffer_load_dwordx4 acc[232:235], %[v_os_b2], %[s_res_d], 0 offen offset:2048\n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[100:101], v[180:181], v[212:215]\n" " v_mfma_i32_16x16x32_i8 v[212:215], acc[100:101], v[180:181], v[212:215]\n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[102:103], v[182:183], v[212:215]\n" " v_mfma_i32_16x16x32_i8 v[212:215], acc[102:103], v[182:183], v[212:215]\n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[104:105], v[184:185], v[212:215]\n" " v_mfma_i32_16x16x32_i8 v[212:215], acc[104:105], v[184:185], v[212:215]\n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[106:107], v[186:187], v[212:215]\n" " v_mfma_i32_16x16x32_i8 v[212:215], acc[106:107], v[186:187], v[212:215]\n"
" buffer_load_dwordx4 acc[236:239], %[v_os_b2], s[12:15], 0 offen offset:3072\n" " buffer_load_dwordx4 acc[236:239], %[v_os_b2], %[s_res_d], 0 offen offset:3072\n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[108:109], v[188:189], v[212:215]\n" " v_mfma_i32_16x16x32_i8 v[212:215], acc[108:109], v[188:189], v[212:215]\n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[110:111], v[190:191], v[212:215]\n" " v_mfma_i32_16x16x32_i8 v[212:215], acc[110:111], v[190:191], v[212:215]\n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[112:113], v[144:145], v[216:219]\n" " v_mfma_i32_16x16x32_i8 v[216:219], acc[112:113], v[144:145], v[216:219]\n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[114:115], v[146:147], v[216:219]\n" " v_mfma_i32_16x16x32_i8 v[216:219], acc[114:115], v[146:147], v[216:219]\n"
" buffer_load_dwordx4 acc[240:243], %[v_os_b3], s[12:15], 0 offen\n" " buffer_load_dwordx4 acc[240:243], %[v_os_b3], %[s_res_d], 0 offen\n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[116:117], v[148:149], v[216:219]\n" " v_mfma_i32_16x16x32_i8 v[216:219], acc[116:117], v[148:149], v[216:219]\n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[118:119], v[150:151], v[216:219]\n" " v_mfma_i32_16x16x32_i8 v[216:219], acc[118:119], v[150:151], v[216:219]\n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[120:121], v[152:153], v[216:219]\n" " v_mfma_i32_16x16x32_i8 v[216:219], acc[120:121], v[152:153], v[216:219]\n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[122:123], v[154:155], v[216:219]\n" " v_mfma_i32_16x16x32_i8 v[216:219], acc[122:123], v[154:155], v[216:219]\n"
" buffer_load_dwordx4 acc[244:247], %[v_os_b3], s[12:15], 0 offen offset:1024\n" " buffer_load_dwordx4 acc[244:247], %[v_os_b3], %[s_res_d], 0 offen offset:1024\n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[124:125], v[156:157], v[216:219]\n" " v_mfma_i32_16x16x32_i8 v[216:219], acc[124:125], v[156:157], v[216:219]\n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[126:127], v[158:159], v[216:219]\n" " v_mfma_i32_16x16x32_i8 v[216:219], acc[126:127], v[158:159], v[216:219]\n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[112:113], v[176:177], v[220:223]\n" " v_mfma_i32_16x16x32_i8 v[220:223], acc[112:113], v[176:177], v[220:223]\n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[114:115], v[178:179], v[220:223]\n" " v_mfma_i32_16x16x32_i8 v[220:223], acc[114:115], v[178:179], v[220:223]\n"
" buffer_load_dwordx4 acc[248:251], %[v_os_b3], s[12:15], 0 offen offset:2048\n" " buffer_load_dwordx4 acc[248:251], %[v_os_b3], %[s_res_d], 0 offen offset:2048\n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[116:117], v[180:181], v[220:223]\n" " v_mfma_i32_16x16x32_i8 v[220:223], acc[116:117], v[180:181], v[220:223]\n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[118:119], v[182:183], v[220:223]\n" " v_mfma_i32_16x16x32_i8 v[220:223], acc[118:119], v[182:183], v[220:223]\n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[120:121], v[184:185], v[220:223]\n" " v_mfma_i32_16x16x32_i8 v[220:223], acc[120:121], v[184:185], v[220:223]\n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[122:123], v[186:187], v[220:223]\n" " v_mfma_i32_16x16x32_i8 v[220:223], acc[122:123], v[186:187], v[220:223]\n"
" buffer_load_dwordx4 acc[252:255], %[v_os_b3], s[12:15], 0 offen offset:3072\n" " buffer_load_dwordx4 acc[252:255], %[v_os_b3], %[s_res_d], 0 offen offset:3072\n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[124:125], v[188:189], v[220:223]\n" " v_mfma_i32_16x16x32_i8 v[220:223], acc[124:125], v[188:189], v[220:223]\n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[126:127], v[190:191], v[220:223]\n" " v_mfma_i32_16x16x32_i8 v[220:223], acc[126:127], v[190:191], v[220:223]\n"
" s_add_u32 s60, 0x00000200, s80 \n" " s_add_u32 s60, 0x00000200, s80 \n"
...@@ -511,139 +511,139 @@ ...@@ -511,139 +511,139 @@
" ds_read_b32 v79, v4 offset:48224 \n" " ds_read_b32 v79, v4 offset:48224 \n"
" s_waitcnt lgkmcnt(0) \n" " s_waitcnt lgkmcnt(0) \n"
" s_mov_b64 exec, s[20:21] \n" " s_mov_b64 exec, s[20:21] \n"
" global_atomic_pk_add_bf16 v80, v64, s[8:9] \n" " global_atomic_pk_add_bf16 v80, v64, [%[s_res_o0],%[s_res_o1]] \n"
" s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[20:21] \n" " s_mov_b64 exec, s[20:21] \n"
" global_atomic_pk_add_bf16 v80, v65, s[8:9] inst_offset:256\n" " global_atomic_pk_add_bf16 v80, v65, [%[s_res_o0],%[s_res_o1]] inst_offset:256\n"
" s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[22:23] \n" " s_mov_b64 exec, s[22:23] \n"
" global_atomic_pk_add_bf16 v82, v66, s[8:9] \n" " global_atomic_pk_add_bf16 v82, v66, [%[s_res_o0],%[s_res_o1]] \n"
" s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[22:23] \n" " s_mov_b64 exec, s[22:23] \n"
" global_atomic_pk_add_bf16 v82, v67, s[8:9] inst_offset:256\n" " global_atomic_pk_add_bf16 v82, v67, [%[s_res_o0],%[s_res_o1]] inst_offset:256\n"
" s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[24:25] \n" " s_mov_b64 exec, s[24:25] \n"
" global_atomic_pk_add_bf16 v84, v68, s[8:9] \n" " global_atomic_pk_add_bf16 v84, v68, [%[s_res_o0],%[s_res_o1]] \n"
" s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[24:25] \n" " s_mov_b64 exec, s[24:25] \n"
" global_atomic_pk_add_bf16 v84, v69, s[8:9] inst_offset:256\n" " global_atomic_pk_add_bf16 v84, v69, [%[s_res_o0],%[s_res_o1]] inst_offset:256\n"
" s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[26:27] \n" " s_mov_b64 exec, s[26:27] \n"
" global_atomic_pk_add_bf16 v86, v70, s[8:9] \n" " global_atomic_pk_add_bf16 v86, v70, [%[s_res_o0],%[s_res_o1]] \n"
" s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[26:27] \n" " s_mov_b64 exec, s[26:27] \n"
" global_atomic_pk_add_bf16 v86, v71, s[8:9] inst_offset:256\n" " global_atomic_pk_add_bf16 v86, v71, [%[s_res_o0],%[s_res_o1]] inst_offset:256\n"
" s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[28:29] \n" " s_mov_b64 exec, s[28:29] \n"
" global_atomic_pk_add_bf16 v88, v72, s[8:9] \n" " global_atomic_pk_add_bf16 v88, v72, [%[s_res_o0],%[s_res_o1]] \n"
" s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[28:29] \n" " s_mov_b64 exec, s[28:29] \n"
" global_atomic_pk_add_bf16 v88, v73, s[8:9] inst_offset:256\n" " global_atomic_pk_add_bf16 v88, v73, [%[s_res_o0],%[s_res_o1]] inst_offset:256\n"
" s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[30:31] \n" " s_mov_b64 exec, s[30:31] \n"
" global_atomic_pk_add_bf16 v90, v74, s[8:9] \n" " global_atomic_pk_add_bf16 v90, v74, [%[s_res_o0],%[s_res_o1]] \n"
" s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[30:31] \n" " s_mov_b64 exec, s[30:31] \n"
" global_atomic_pk_add_bf16 v90, v75, s[8:9] inst_offset:256\n" " global_atomic_pk_add_bf16 v90, v75, [%[s_res_o0],%[s_res_o1]] inst_offset:256\n"
" s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[32:33] \n" " s_mov_b64 exec, s[32:33] \n"
" global_atomic_pk_add_bf16 v92, v76, s[8:9] \n" " global_atomic_pk_add_bf16 v92, v76, [%[s_res_o0],%[s_res_o1]] \n"
" s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[32:33] \n" " s_mov_b64 exec, s[32:33] \n"
" global_atomic_pk_add_bf16 v92, v77, s[8:9] inst_offset:256\n" " global_atomic_pk_add_bf16 v92, v77, [%[s_res_o0],%[s_res_o1]] inst_offset:256\n"
" s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[34:35] \n" " s_mov_b64 exec, s[34:35] \n"
" global_atomic_pk_add_bf16 v94, v78, s[8:9] \n" " global_atomic_pk_add_bf16 v94, v78, [%[s_res_o0],%[s_res_o1]] \n"
" s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[34:35] \n" " s_mov_b64 exec, s[34:35] \n"
" global_atomic_pk_add_bf16 v94, v79, s[8:9] inst_offset:256\n" " global_atomic_pk_add_bf16 v94, v79, [%[s_res_o0],%[s_res_o1]] inst_offset:256\n"
" s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[36:37] \n"
" s_add_u32 s8, s59, s8 \n" " s_add_u32 %[s_res_o0], s59, %[s_res_o0] \n"
" s_addc_u32 s9, 0, s9 \n" " s_addc_u32 %[s_res_o1], 0, %[s_res_o1] \n"
" s_addk_i32 s80, 0x0100 \n" " s_addk_i32 s80, 0x0100 \n"
" s_cmp_lt_i32 s80, s81 \n" " s_cmp_lt_i32 s80, s81 \n"
" s_cbranch_scc0 label_2301 \n" " s_cbranch_scc0 label_end_gemm2 \n"
" s_waitcnt vmcnt(41) \n" " s_waitcnt vmcnt(41) \n"
" s_barrier \n" " s_barrier \n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[128:129], v[128:129], 0\n" " v_mfma_i32_16x16x32_i8 v[224:227], acc[128:129], v[128:129], 0\n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[130:131], v[130:131], v[224:227]\n" " v_mfma_i32_16x16x32_i8 v[224:227], acc[130:131], v[130:131], v[224:227]\n"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen\n" " buffer_load_dwordx4 acc[0:3], %[v_os_b0], %[s_res_d], 0 offen\n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[132:133], v[132:133], v[224:227]\n" " v_mfma_i32_16x16x32_i8 v[224:227], acc[132:133], v[132:133], v[224:227]\n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[134:135], v[134:135], v[224:227]\n" " v_mfma_i32_16x16x32_i8 v[224:227], acc[134:135], v[134:135], v[224:227]\n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[136:137], v[136:137], v[224:227]\n" " v_mfma_i32_16x16x32_i8 v[224:227], acc[136:137], v[136:137], v[224:227]\n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[138:139], v[138:139], v[224:227]\n" " v_mfma_i32_16x16x32_i8 v[224:227], acc[138:139], v[138:139], v[224:227]\n"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024\n" " buffer_load_dwordx4 acc[4:7], %[v_os_b0], %[s_res_d], 0 offen offset:1024\n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[140:141], v[140:141], v[224:227]\n" " v_mfma_i32_16x16x32_i8 v[224:227], acc[140:141], v[140:141], v[224:227]\n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[142:143], v[142:143], v[224:227]\n" " v_mfma_i32_16x16x32_i8 v[224:227], acc[142:143], v[142:143], v[224:227]\n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[128:129], v[160:161], 0\n" " v_mfma_i32_16x16x32_i8 v[228:231], acc[128:129], v[160:161], 0\n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[130:131], v[162:163], v[228:231]\n" " v_mfma_i32_16x16x32_i8 v[228:231], acc[130:131], v[162:163], v[228:231]\n"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048\n" " buffer_load_dwordx4 acc[8:11], %[v_os_b0], %[s_res_d], 0 offen offset:2048\n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[132:133], v[164:165], v[228:231]\n" " v_mfma_i32_16x16x32_i8 v[228:231], acc[132:133], v[164:165], v[228:231]\n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[134:135], v[166:167], v[228:231]\n" " v_mfma_i32_16x16x32_i8 v[228:231], acc[134:135], v[166:167], v[228:231]\n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[136:137], v[168:169], v[228:231]\n" " v_mfma_i32_16x16x32_i8 v[228:231], acc[136:137], v[168:169], v[228:231]\n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[138:139], v[170:171], v[228:231]\n" " v_mfma_i32_16x16x32_i8 v[228:231], acc[138:139], v[170:171], v[228:231]\n"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072\n" " buffer_load_dwordx4 acc[12:15], %[v_os_b0], %[s_res_d], 0 offen offset:3072\n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[140:141], v[172:173], v[228:231]\n" " v_mfma_i32_16x16x32_i8 v[228:231], acc[140:141], v[172:173], v[228:231]\n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[142:143], v[174:175], v[228:231]\n" " v_mfma_i32_16x16x32_i8 v[228:231], acc[142:143], v[174:175], v[228:231]\n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[144:145], v[128:129], 0\n" " v_mfma_i32_16x16x32_i8 v[232:235], acc[144:145], v[128:129], 0\n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[146:147], v[130:131], v[232:235]\n" " v_mfma_i32_16x16x32_i8 v[232:235], acc[146:147], v[130:131], v[232:235]\n"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen\n" " buffer_load_dwordx4 acc[16:19], %[v_os_b1], %[s_res_d], 0 offen\n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[148:149], v[132:133], v[232:235]\n" " v_mfma_i32_16x16x32_i8 v[232:235], acc[148:149], v[132:133], v[232:235]\n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[150:151], v[134:135], v[232:235]\n" " v_mfma_i32_16x16x32_i8 v[232:235], acc[150:151], v[134:135], v[232:235]\n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[152:153], v[136:137], v[232:235]\n" " v_mfma_i32_16x16x32_i8 v[232:235], acc[152:153], v[136:137], v[232:235]\n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[154:155], v[138:139], v[232:235]\n" " v_mfma_i32_16x16x32_i8 v[232:235], acc[154:155], v[138:139], v[232:235]\n"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024\n" " buffer_load_dwordx4 acc[20:23], %[v_os_b1], %[s_res_d], 0 offen offset:1024\n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[156:157], v[140:141], v[232:235]\n" " v_mfma_i32_16x16x32_i8 v[232:235], acc[156:157], v[140:141], v[232:235]\n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[158:159], v[142:143], v[232:235]\n" " v_mfma_i32_16x16x32_i8 v[232:235], acc[158:159], v[142:143], v[232:235]\n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[144:145], v[160:161], 0\n" " v_mfma_i32_16x16x32_i8 v[236:239], acc[144:145], v[160:161], 0\n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[146:147], v[162:163], v[236:239]\n" " v_mfma_i32_16x16x32_i8 v[236:239], acc[146:147], v[162:163], v[236:239]\n"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048\n" " buffer_load_dwordx4 acc[24:27], %[v_os_b1], %[s_res_d], 0 offen offset:2048\n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[148:149], v[164:165], v[236:239]\n" " v_mfma_i32_16x16x32_i8 v[236:239], acc[148:149], v[164:165], v[236:239]\n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[150:151], v[166:167], v[236:239]\n" " v_mfma_i32_16x16x32_i8 v[236:239], acc[150:151], v[166:167], v[236:239]\n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[152:153], v[168:169], v[236:239]\n" " v_mfma_i32_16x16x32_i8 v[236:239], acc[152:153], v[168:169], v[236:239]\n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[154:155], v[170:171], v[236:239]\n" " v_mfma_i32_16x16x32_i8 v[236:239], acc[154:155], v[170:171], v[236:239]\n"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072\n" " buffer_load_dwordx4 acc[28:31], %[v_os_b1], %[s_res_d], 0 offen offset:3072\n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[156:157], v[172:173], v[236:239]\n" " v_mfma_i32_16x16x32_i8 v[236:239], acc[156:157], v[172:173], v[236:239]\n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[158:159], v[174:175], v[236:239]\n" " v_mfma_i32_16x16x32_i8 v[236:239], acc[158:159], v[174:175], v[236:239]\n"
" s_waitcnt vmcnt(41) \n" " s_waitcnt vmcnt(41) \n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[160:161], v[128:129], 0\n" " v_mfma_i32_16x16x32_i8 v[240:243], acc[160:161], v[128:129], 0\n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[162:163], v[130:131], v[240:243]\n" " v_mfma_i32_16x16x32_i8 v[240:243], acc[162:163], v[130:131], v[240:243]\n"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen\n" " buffer_load_dwordx4 acc[32:35], %[v_os_b2], %[s_res_d], 0 offen\n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[164:165], v[132:133], v[240:243]\n" " v_mfma_i32_16x16x32_i8 v[240:243], acc[164:165], v[132:133], v[240:243]\n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[166:167], v[134:135], v[240:243]\n" " v_mfma_i32_16x16x32_i8 v[240:243], acc[166:167], v[134:135], v[240:243]\n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[168:169], v[136:137], v[240:243]\n" " v_mfma_i32_16x16x32_i8 v[240:243], acc[168:169], v[136:137], v[240:243]\n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[170:171], v[138:139], v[240:243]\n" " v_mfma_i32_16x16x32_i8 v[240:243], acc[170:171], v[138:139], v[240:243]\n"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024\n" " buffer_load_dwordx4 acc[36:39], %[v_os_b2], %[s_res_d], 0 offen offset:1024\n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[172:173], v[140:141], v[240:243]\n" " v_mfma_i32_16x16x32_i8 v[240:243], acc[172:173], v[140:141], v[240:243]\n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[174:175], v[142:143], v[240:243]\n" " v_mfma_i32_16x16x32_i8 v[240:243], acc[174:175], v[142:143], v[240:243]\n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[160:161], v[160:161], 0\n" " v_mfma_i32_16x16x32_i8 v[244:247], acc[160:161], v[160:161], 0\n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[162:163], v[162:163], v[244:247]\n" " v_mfma_i32_16x16x32_i8 v[244:247], acc[162:163], v[162:163], v[244:247]\n"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048\n" " buffer_load_dwordx4 acc[40:43], %[v_os_b2], %[s_res_d], 0 offen offset:2048\n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[164:165], v[164:165], v[244:247]\n" " v_mfma_i32_16x16x32_i8 v[244:247], acc[164:165], v[164:165], v[244:247]\n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[166:167], v[166:167], v[244:247]\n" " v_mfma_i32_16x16x32_i8 v[244:247], acc[166:167], v[166:167], v[244:247]\n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[168:169], v[168:169], v[244:247]\n" " v_mfma_i32_16x16x32_i8 v[244:247], acc[168:169], v[168:169], v[244:247]\n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[170:171], v[170:171], v[244:247]\n" " v_mfma_i32_16x16x32_i8 v[244:247], acc[170:171], v[170:171], v[244:247]\n"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072\n" " buffer_load_dwordx4 acc[44:47], %[v_os_b2], %[s_res_d], 0 offen offset:3072\n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[172:173], v[172:173], v[244:247]\n" " v_mfma_i32_16x16x32_i8 v[244:247], acc[172:173], v[172:173], v[244:247]\n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[174:175], v[174:175], v[244:247]\n" " v_mfma_i32_16x16x32_i8 v[244:247], acc[174:175], v[174:175], v[244:247]\n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[176:177], v[128:129], 0\n" " v_mfma_i32_16x16x32_i8 v[248:251], acc[176:177], v[128:129], 0\n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[178:179], v[130:131], v[248:251]\n" " v_mfma_i32_16x16x32_i8 v[248:251], acc[178:179], v[130:131], v[248:251]\n"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen\n" " buffer_load_dwordx4 acc[48:51], %[v_os_b3], %[s_res_d], 0 offen\n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[180:181], v[132:133], v[248:251]\n" " v_mfma_i32_16x16x32_i8 v[248:251], acc[180:181], v[132:133], v[248:251]\n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[182:183], v[134:135], v[248:251]\n" " v_mfma_i32_16x16x32_i8 v[248:251], acc[182:183], v[134:135], v[248:251]\n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[184:185], v[136:137], v[248:251]\n" " v_mfma_i32_16x16x32_i8 v[248:251], acc[184:185], v[136:137], v[248:251]\n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[186:187], v[138:139], v[248:251]\n" " v_mfma_i32_16x16x32_i8 v[248:251], acc[186:187], v[138:139], v[248:251]\n"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024\n" " buffer_load_dwordx4 acc[52:55], %[v_os_b3], %[s_res_d], 0 offen offset:1024\n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[188:189], v[140:141], v[248:251]\n" " v_mfma_i32_16x16x32_i8 v[248:251], acc[188:189], v[140:141], v[248:251]\n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[190:191], v[142:143], v[248:251]\n" " v_mfma_i32_16x16x32_i8 v[248:251], acc[190:191], v[142:143], v[248:251]\n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[176:177], v[160:161], 0\n" " v_mfma_i32_16x16x32_i8 v[252:255], acc[176:177], v[160:161], 0\n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[178:179], v[162:163], v[252:255]\n" " v_mfma_i32_16x16x32_i8 v[252:255], acc[178:179], v[162:163], v[252:255]\n"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048\n" " buffer_load_dwordx4 acc[56:59], %[v_os_b3], %[s_res_d], 0 offen offset:2048\n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[180:181], v[164:165], v[252:255]\n" " v_mfma_i32_16x16x32_i8 v[252:255], acc[180:181], v[164:165], v[252:255]\n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[182:183], v[166:167], v[252:255]\n" " v_mfma_i32_16x16x32_i8 v[252:255], acc[182:183], v[166:167], v[252:255]\n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[184:185], v[168:169], v[252:255]\n" " v_mfma_i32_16x16x32_i8 v[252:255], acc[184:185], v[168:169], v[252:255]\n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[186:187], v[170:171], v[252:255]\n" " v_mfma_i32_16x16x32_i8 v[252:255], acc[186:187], v[170:171], v[252:255]\n"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072\n" " buffer_load_dwordx4 acc[60:63], %[v_os_b3], %[s_res_d], 0 offen offset:3072\n"
" s_add_u32 s12, %[s_tile_os_b_half], s12 \n" " s_add_u32 s12, %[s_tile_os_b_half], s12 \n"
" s_addc_u32 s13, 0, s13 \n" " s_addc_u32 s13, 0, s13 \n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[188:189], v[172:173], v[252:255]\n" " v_mfma_i32_16x16x32_i8 v[252:255], acc[188:189], v[172:173], v[252:255]\n"
...@@ -651,84 +651,84 @@ ...@@ -651,84 +651,84 @@
" s_waitcnt vmcnt(41) \n" " s_waitcnt vmcnt(41) \n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[192:193], v[144:145], v[224:227]\n" " v_mfma_i32_16x16x32_i8 v[224:227], acc[192:193], v[144:145], v[224:227]\n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[194:195], v[146:147], v[224:227]\n" " v_mfma_i32_16x16x32_i8 v[224:227], acc[194:195], v[146:147], v[224:227]\n"
" buffer_load_dwordx4 acc[64:67], %[v_os_b0], s[12:15], 0 offen\n" " buffer_load_dwordx4 acc[64:67], %[v_os_b0], %[s_res_d], 0 offen\n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[196:197], v[148:149], v[224:227]\n" " v_mfma_i32_16x16x32_i8 v[224:227], acc[196:197], v[148:149], v[224:227]\n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[198:199], v[150:151], v[224:227]\n" " v_mfma_i32_16x16x32_i8 v[224:227], acc[198:199], v[150:151], v[224:227]\n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[200:201], v[152:153], v[224:227]\n" " v_mfma_i32_16x16x32_i8 v[224:227], acc[200:201], v[152:153], v[224:227]\n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[202:203], v[154:155], v[224:227]\n" " v_mfma_i32_16x16x32_i8 v[224:227], acc[202:203], v[154:155], v[224:227]\n"
" buffer_load_dwordx4 acc[68:71], %[v_os_b0], s[12:15], 0 offen offset:1024\n" " buffer_load_dwordx4 acc[68:71], %[v_os_b0], %[s_res_d], 0 offen offset:1024\n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[204:205], v[156:157], v[224:227]\n" " v_mfma_i32_16x16x32_i8 v[224:227], acc[204:205], v[156:157], v[224:227]\n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[206:207], v[158:159], v[224:227]\n" " v_mfma_i32_16x16x32_i8 v[224:227], acc[206:207], v[158:159], v[224:227]\n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[192:193], v[176:177], v[228:231]\n" " v_mfma_i32_16x16x32_i8 v[228:231], acc[192:193], v[176:177], v[228:231]\n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[194:195], v[178:179], v[228:231]\n" " v_mfma_i32_16x16x32_i8 v[228:231], acc[194:195], v[178:179], v[228:231]\n"
" buffer_load_dwordx4 acc[72:75], %[v_os_b0], s[12:15], 0 offen offset:2048\n" " buffer_load_dwordx4 acc[72:75], %[v_os_b0], %[s_res_d], 0 offen offset:2048\n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[196:197], v[180:181], v[228:231]\n" " v_mfma_i32_16x16x32_i8 v[228:231], acc[196:197], v[180:181], v[228:231]\n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[198:199], v[182:183], v[228:231]\n" " v_mfma_i32_16x16x32_i8 v[228:231], acc[198:199], v[182:183], v[228:231]\n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[200:201], v[184:185], v[228:231]\n" " v_mfma_i32_16x16x32_i8 v[228:231], acc[200:201], v[184:185], v[228:231]\n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[202:203], v[186:187], v[228:231]\n" " v_mfma_i32_16x16x32_i8 v[228:231], acc[202:203], v[186:187], v[228:231]\n"
" buffer_load_dwordx4 acc[76:79], %[v_os_b0], s[12:15], 0 offen offset:3072\n" " buffer_load_dwordx4 acc[76:79], %[v_os_b0], %[s_res_d], 0 offen offset:3072\n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[204:205], v[188:189], v[228:231]\n" " v_mfma_i32_16x16x32_i8 v[228:231], acc[204:205], v[188:189], v[228:231]\n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[206:207], v[190:191], v[228:231]\n" " v_mfma_i32_16x16x32_i8 v[228:231], acc[206:207], v[190:191], v[228:231]\n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[208:209], v[144:145], v[232:235]\n" " v_mfma_i32_16x16x32_i8 v[232:235], acc[208:209], v[144:145], v[232:235]\n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[210:211], v[146:147], v[232:235]\n" " v_mfma_i32_16x16x32_i8 v[232:235], acc[210:211], v[146:147], v[232:235]\n"
" buffer_load_dwordx4 acc[80:83], %[v_os_b1], s[12:15], 0 offen\n" " buffer_load_dwordx4 acc[80:83], %[v_os_b1], %[s_res_d], 0 offen\n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[212:213], v[148:149], v[232:235]\n" " v_mfma_i32_16x16x32_i8 v[232:235], acc[212:213], v[148:149], v[232:235]\n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[214:215], v[150:151], v[232:235]\n" " v_mfma_i32_16x16x32_i8 v[232:235], acc[214:215], v[150:151], v[232:235]\n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[216:217], v[152:153], v[232:235]\n" " v_mfma_i32_16x16x32_i8 v[232:235], acc[216:217], v[152:153], v[232:235]\n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[218:219], v[154:155], v[232:235]\n" " v_mfma_i32_16x16x32_i8 v[232:235], acc[218:219], v[154:155], v[232:235]\n"
" buffer_load_dwordx4 acc[84:87], %[v_os_b1], s[12:15], 0 offen offset:1024\n" " buffer_load_dwordx4 acc[84:87], %[v_os_b1], %[s_res_d], 0 offen offset:1024\n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[220:221], v[156:157], v[232:235]\n" " v_mfma_i32_16x16x32_i8 v[232:235], acc[220:221], v[156:157], v[232:235]\n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[222:223], v[158:159], v[232:235]\n" " v_mfma_i32_16x16x32_i8 v[232:235], acc[222:223], v[158:159], v[232:235]\n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[208:209], v[176:177], v[236:239]\n" " v_mfma_i32_16x16x32_i8 v[236:239], acc[208:209], v[176:177], v[236:239]\n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[210:211], v[178:179], v[236:239]\n" " v_mfma_i32_16x16x32_i8 v[236:239], acc[210:211], v[178:179], v[236:239]\n"
" buffer_load_dwordx4 acc[88:91], %[v_os_b1], s[12:15], 0 offen offset:2048\n" " buffer_load_dwordx4 acc[88:91], %[v_os_b1], %[s_res_d], 0 offen offset:2048\n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[212:213], v[180:181], v[236:239]\n" " v_mfma_i32_16x16x32_i8 v[236:239], acc[212:213], v[180:181], v[236:239]\n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[214:215], v[182:183], v[236:239]\n" " v_mfma_i32_16x16x32_i8 v[236:239], acc[214:215], v[182:183], v[236:239]\n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[216:217], v[184:185], v[236:239]\n" " v_mfma_i32_16x16x32_i8 v[236:239], acc[216:217], v[184:185], v[236:239]\n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[218:219], v[186:187], v[236:239]\n" " v_mfma_i32_16x16x32_i8 v[236:239], acc[218:219], v[186:187], v[236:239]\n"
" buffer_load_dwordx4 acc[92:95], %[v_os_b1], s[12:15], 0 offen offset:3072\n" " buffer_load_dwordx4 acc[92:95], %[v_os_b1], %[s_res_d], 0 offen offset:3072\n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[220:221], v[188:189], v[236:239]\n" " v_mfma_i32_16x16x32_i8 v[236:239], acc[220:221], v[188:189], v[236:239]\n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[222:223], v[190:191], v[236:239]\n" " v_mfma_i32_16x16x32_i8 v[236:239], acc[222:223], v[190:191], v[236:239]\n"
" s_waitcnt vmcnt(40) \n" " s_waitcnt vmcnt(40) \n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[224:225], v[144:145], v[240:243]\n" " v_mfma_i32_16x16x32_i8 v[240:243], acc[224:225], v[144:145], v[240:243]\n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[226:227], v[146:147], v[240:243]\n" " v_mfma_i32_16x16x32_i8 v[240:243], acc[226:227], v[146:147], v[240:243]\n"
" buffer_load_dwordx4 acc[96:99], %[v_os_b2], s[12:15], 0 offen\n" " buffer_load_dwordx4 acc[96:99], %[v_os_b2], %[s_res_d], 0 offen\n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[228:229], v[148:149], v[240:243]\n" " v_mfma_i32_16x16x32_i8 v[240:243], acc[228:229], v[148:149], v[240:243]\n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[230:231], v[150:151], v[240:243]\n" " v_mfma_i32_16x16x32_i8 v[240:243], acc[230:231], v[150:151], v[240:243]\n"
" buffer_load_dword v12, v5, s[16:19], 0 offen \n" " buffer_load_dword v12, v5, %[s_res_dq], 0 offen \n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[232:233], v[152:153], v[240:243]\n" " v_mfma_i32_16x16x32_i8 v[240:243], acc[232:233], v[152:153], v[240:243]\n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[234:235], v[154:155], v[240:243]\n" " v_mfma_i32_16x16x32_i8 v[240:243], acc[234:235], v[154:155], v[240:243]\n"
" buffer_load_dwordx4 acc[100:103], %[v_os_b2], s[12:15], 0 offen offset:1024\n" " buffer_load_dwordx4 acc[100:103], %[v_os_b2], %[s_res_d], 0 offen offset:1024\n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[236:237], v[156:157], v[240:243]\n" " v_mfma_i32_16x16x32_i8 v[240:243], acc[236:237], v[156:157], v[240:243]\n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[238:239], v[158:159], v[240:243]\n" " v_mfma_i32_16x16x32_i8 v[240:243], acc[238:239], v[158:159], v[240:243]\n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[224:225], v[176:177], v[244:247]\n" " v_mfma_i32_16x16x32_i8 v[244:247], acc[224:225], v[176:177], v[244:247]\n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[226:227], v[178:179], v[244:247]\n" " v_mfma_i32_16x16x32_i8 v[244:247], acc[226:227], v[178:179], v[244:247]\n"
" buffer_load_dwordx4 acc[104:107], %[v_os_b2], s[12:15], 0 offen offset:2048\n" " buffer_load_dwordx4 acc[104:107], %[v_os_b2], %[s_res_d], 0 offen offset:2048\n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[228:229], v[180:181], v[244:247]\n" " v_mfma_i32_16x16x32_i8 v[244:247], acc[228:229], v[180:181], v[244:247]\n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[230:231], v[182:183], v[244:247]\n" " v_mfma_i32_16x16x32_i8 v[244:247], acc[230:231], v[182:183], v[244:247]\n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[232:233], v[184:185], v[244:247]\n" " v_mfma_i32_16x16x32_i8 v[244:247], acc[232:233], v[184:185], v[244:247]\n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[234:235], v[186:187], v[244:247]\n" " v_mfma_i32_16x16x32_i8 v[244:247], acc[234:235], v[186:187], v[244:247]\n"
" buffer_load_dwordx4 acc[108:111], %[v_os_b2], s[12:15], 0 offen offset:3072\n" " buffer_load_dwordx4 acc[108:111], %[v_os_b2], %[s_res_d], 0 offen offset:3072\n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[236:237], v[188:189], v[244:247]\n" " v_mfma_i32_16x16x32_i8 v[244:247], acc[236:237], v[188:189], v[244:247]\n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[238:239], v[190:191], v[244:247]\n" " v_mfma_i32_16x16x32_i8 v[244:247], acc[238:239], v[190:191], v[244:247]\n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[240:241], v[144:145], v[248:251]\n" " v_mfma_i32_16x16x32_i8 v[248:251], acc[240:241], v[144:145], v[248:251]\n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[242:243], v[146:147], v[248:251]\n" " v_mfma_i32_16x16x32_i8 v[248:251], acc[242:243], v[146:147], v[248:251]\n"
" buffer_load_dwordx4 acc[112:115], %[v_os_b3], s[12:15], 0 offen\n" " buffer_load_dwordx4 acc[112:115], %[v_os_b3], %[s_res_d], 0 offen\n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[244:245], v[148:149], v[248:251]\n" " v_mfma_i32_16x16x32_i8 v[248:251], acc[244:245], v[148:149], v[248:251]\n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[246:247], v[150:151], v[248:251]\n" " v_mfma_i32_16x16x32_i8 v[248:251], acc[246:247], v[150:151], v[248:251]\n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[248:249], v[152:153], v[248:251]\n" " v_mfma_i32_16x16x32_i8 v[248:251], acc[248:249], v[152:153], v[248:251]\n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[250:251], v[154:155], v[248:251]\n" " v_mfma_i32_16x16x32_i8 v[248:251], acc[250:251], v[154:155], v[248:251]\n"
" buffer_load_dwordx4 acc[116:119], %[v_os_b3], s[12:15], 0 offen offset:1024\n" " buffer_load_dwordx4 acc[116:119], %[v_os_b3], %[s_res_d], 0 offen offset:1024\n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[252:253], v[156:157], v[248:251]\n" " v_mfma_i32_16x16x32_i8 v[248:251], acc[252:253], v[156:157], v[248:251]\n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[254:255], v[158:159], v[248:251]\n" " v_mfma_i32_16x16x32_i8 v[248:251], acc[254:255], v[158:159], v[248:251]\n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[240:241], v[176:177], v[252:255]\n" " v_mfma_i32_16x16x32_i8 v[252:255], acc[240:241], v[176:177], v[252:255]\n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[242:243], v[178:179], v[252:255]\n" " v_mfma_i32_16x16x32_i8 v[252:255], acc[242:243], v[178:179], v[252:255]\n"
" buffer_load_dwordx4 acc[120:123], %[v_os_b3], s[12:15], 0 offen offset:2048\n" " buffer_load_dwordx4 acc[120:123], %[v_os_b3], %[s_res_d], 0 offen offset:2048\n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[244:245], v[180:181], v[252:255]\n" " v_mfma_i32_16x16x32_i8 v[252:255], acc[244:245], v[180:181], v[252:255]\n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[246:247], v[182:183], v[252:255]\n" " v_mfma_i32_16x16x32_i8 v[252:255], acc[246:247], v[182:183], v[252:255]\n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[248:249], v[184:185], v[252:255]\n" " v_mfma_i32_16x16x32_i8 v[252:255], acc[248:249], v[184:185], v[252:255]\n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[250:251], v[186:187], v[252:255]\n" " v_mfma_i32_16x16x32_i8 v[252:255], acc[250:251], v[186:187], v[252:255]\n"
" buffer_load_dwordx4 acc[124:127], %[v_os_b3], s[12:15], 0 offen offset:3072\n" " buffer_load_dwordx4 acc[124:127], %[v_os_b3], %[s_res_d], 0 offen offset:3072\n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[252:253], v[188:189], v[252:255]\n" " v_mfma_i32_16x16x32_i8 v[252:255], acc[252:253], v[188:189], v[252:255]\n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[254:255], v[190:191], v[252:255]\n" " v_mfma_i32_16x16x32_i8 v[252:255], acc[254:255], v[190:191], v[252:255]\n"
" s_add_u32 s60, 0x00000200, s80 \n" " s_add_u32 s60, 0x00000200, s80 \n"
...@@ -1008,60 +1008,60 @@ ...@@ -1008,60 +1008,60 @@
" ds_read_b32 v79, v4 offset:48224 \n" " ds_read_b32 v79, v4 offset:48224 \n"
" s_waitcnt lgkmcnt(0) \n" " s_waitcnt lgkmcnt(0) \n"
" s_mov_b64 exec, s[20:21] \n" " s_mov_b64 exec, s[20:21] \n"
" global_atomic_pk_add_bf16 v80, v64, s[8:9] \n" " global_atomic_pk_add_bf16 v80, v64, [%[s_res_o0],%[s_res_o1]] \n"
" s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[20:21] \n" " s_mov_b64 exec, s[20:21] \n"
" global_atomic_pk_add_bf16 v80, v65, s[8:9] inst_offset:256\n" " global_atomic_pk_add_bf16 v80, v65, [%[s_res_o0],%[s_res_o1]] inst_offset:256\n"
" s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[22:23] \n" " s_mov_b64 exec, s[22:23] \n"
" global_atomic_pk_add_bf16 v82, v66, s[8:9] \n" " global_atomic_pk_add_bf16 v82, v66, [%[s_res_o0],%[s_res_o1]] \n"
" s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[22:23] \n" " s_mov_b64 exec, s[22:23] \n"
" global_atomic_pk_add_bf16 v82, v67, s[8:9] inst_offset:256\n" " global_atomic_pk_add_bf16 v82, v67, [%[s_res_o0],%[s_res_o1]] inst_offset:256\n"
" s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[24:25] \n" " s_mov_b64 exec, s[24:25] \n"
" global_atomic_pk_add_bf16 v84, v68, s[8:9] \n" " global_atomic_pk_add_bf16 v84, v68, [%[s_res_o0],%[s_res_o1]] \n"
" s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[24:25] \n" " s_mov_b64 exec, s[24:25] \n"
" global_atomic_pk_add_bf16 v84, v69, s[8:9] inst_offset:256\n" " global_atomic_pk_add_bf16 v84, v69, [%[s_res_o0],%[s_res_o1]] inst_offset:256\n"
" s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[26:27] \n" " s_mov_b64 exec, s[26:27] \n"
" global_atomic_pk_add_bf16 v86, v70, s[8:9] \n" " global_atomic_pk_add_bf16 v86, v70, [%[s_res_o0],%[s_res_o1]] \n"
" s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[26:27] \n" " s_mov_b64 exec, s[26:27] \n"
" global_atomic_pk_add_bf16 v86, v71, s[8:9] inst_offset:256\n" " global_atomic_pk_add_bf16 v86, v71, [%[s_res_o0],%[s_res_o1]] inst_offset:256\n"
" s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[28:29] \n" " s_mov_b64 exec, s[28:29] \n"
" global_atomic_pk_add_bf16 v88, v72, s[8:9] \n" " global_atomic_pk_add_bf16 v88, v72, [%[s_res_o0],%[s_res_o1]] \n"
" s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[28:29] \n" " s_mov_b64 exec, s[28:29] \n"
" global_atomic_pk_add_bf16 v88, v73, s[8:9] inst_offset:256\n" " global_atomic_pk_add_bf16 v88, v73, [%[s_res_o0],%[s_res_o1]] inst_offset:256\n"
" s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[30:31] \n" " s_mov_b64 exec, s[30:31] \n"
" global_atomic_pk_add_bf16 v90, v74, s[8:9] \n" " global_atomic_pk_add_bf16 v90, v74, [%[s_res_o0],%[s_res_o1]] \n"
" s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[30:31] \n" " s_mov_b64 exec, s[30:31] \n"
" global_atomic_pk_add_bf16 v90, v75, s[8:9] inst_offset:256\n" " global_atomic_pk_add_bf16 v90, v75, [%[s_res_o0],%[s_res_o1]] inst_offset:256\n"
" s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[32:33] \n" " s_mov_b64 exec, s[32:33] \n"
" global_atomic_pk_add_bf16 v92, v76, s[8:9] \n" " global_atomic_pk_add_bf16 v92, v76, [%[s_res_o0],%[s_res_o1]] \n"
" s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[32:33] \n" " s_mov_b64 exec, s[32:33] \n"
" global_atomic_pk_add_bf16 v92, v77, s[8:9] inst_offset:256\n" " global_atomic_pk_add_bf16 v92, v77, [%[s_res_o0],%[s_res_o1]] inst_offset:256\n"
" s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[34:35] \n" " s_mov_b64 exec, s[34:35] \n"
" global_atomic_pk_add_bf16 v94, v78, s[8:9] \n" " global_atomic_pk_add_bf16 v94, v78, [%[s_res_o0],%[s_res_o1]] \n"
" s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[34:35] \n" " s_mov_b64 exec, s[34:35] \n"
" global_atomic_pk_add_bf16 v94, v79, s[8:9] inst_offset:256\n" " global_atomic_pk_add_bf16 v94, v79, [%[s_res_o0],%[s_res_o1]] inst_offset:256\n"
" s_mov_b64 exec, s[36:37] \n" " s_mov_b64 exec, s[36:37] \n"
" s_add_u32 s8, s59, s8 \n" " s_add_u32 %[s_res_o0], s59, %[s_res_o0] \n"
" s_addc_u32 s9, 0, s9 \n" " s_addc_u32 %[s_res_o1], 0, %[s_res_o1] \n"
" s_addk_i32 s80, 0x0100 \n" " s_addk_i32 s80, 0x0100 \n"
" s_cmp_lt_i32 s80, s81 \n" " s_cmp_lt_i32 s80, s81 \n"
" s_cbranch_scc0 label_2301 \n" " s_cbranch_scc0 label_end_gemm2 \n"
" s_branch label_0C3C \n" " s_branch label_startgemm2 \n"
" label_2301: \n" " label_end_gemm2: \n"
" s_waitcnt 0x0000 \n" " s_waitcnt 0x0000 \n"
" s_endpgm \n" " s_endpgm \n"
#undef _UK_MFMA_ #undef _UK_MFMA_
......
...@@ -18,31 +18,7 @@ ...@@ -18,31 +18,7 @@
" v_mul_f32 a1, v17, a1 row_newbcast:13 \n" \ " v_mul_f32 a1, v17, a1 row_newbcast:13 \n" \
" v_mul_f32 a2, v17, a2 row_newbcast:14 \n" \ " v_mul_f32 a2, v17, a2 row_newbcast:14 \n" \
" v_mul_f32 a3, v17, a3 row_newbcast:15 \n" \ " v_mul_f32 a3, v17, a3 row_newbcast:15 \n" \
";-------------------------------\n"
"s_mov_b32 s28, %[s_res_aq0] \n"
"s_mov_b32 s29, %[s_res_aq1] \n"
"s_mov_b32 s30, %[s_res_aq2] \n"
"s_mov_b32 s31, %[s_res_aq3] \n"
"s_mov_b32 s16, %[s_res_dq0] \n"
"s_mov_b32 s17, %[s_res_dq1] \n"
"s_mov_b32 s18, %[s_res_dq2] \n"
"s_mov_b32 s19, %[s_res_dq3] \n"
"s_mov_b32 s32, %[s_res_gq0] \n"
"s_mov_b32 s33, %[s_res_gq1] \n"
"s_mov_b32 s34, %[s_res_gq2] \n"
"s_mov_b32 s35, %[s_res_gq3] \n"
"s_mov_b32 s36, %[s_res_smq0] \n"
"s_mov_b32 s37, %[s_res_smq1] \n"
"s_mov_b32 s38, %[s_res_smq2] \n"
"s_mov_b32 s39, %[s_res_smq3] \n"
"s_mov_b32 s20, %[s_res_a0] \n"
"s_mov_b32 s21, %[s_res_a1] \n"
"s_mov_b32 s22, %[s_res_a2] \n"
"s_mov_b32 s23, %[s_res_a3] \n"
"s_mov_b32 s24, %[s_res_b0] \n"
"s_mov_b32 s25, %[s_res_b1] \n"
"s_mov_b32 s26, %[s_res_b2] \n"
"s_mov_b32 s27, %[s_res_b3] \n"
";---------------------------------------------- \n" ";---------------------------------------------- \n"
" v_lshrrev_b32 v54, 4, v0 \n" " v_lshrrev_b32 v54, 4, v0 \n"
" v_lshlrev_b32 v55, 2, v54 \n" " v_lshlrev_b32 v55, 2, v54 \n"
...@@ -81,19 +57,19 @@ ...@@ -81,19 +57,19 @@
" v_lshrrev_b32 v54, 24, %[v_token_id0] \n" " v_lshrrev_b32 v54, 24, %[v_token_id0] \n"
" v_mul_i32_i24 v54, s66, v54 \n" " v_mul_i32_i24 v54, s66, v54 \n"
" v_and_b32 v55, 0x00ffffff, %[v_token_id0] \n" " v_and_b32 v55, 0x00ffffff, %[v_token_id0] \n"
" v_add_u32 v6, v54, v55 \n" " v_add_u32 %[v_token_id0], v54, v55 \n"
" v_lshrrev_b32 v54, 24, %[v_token_id1] \n" " v_lshrrev_b32 v54, 24, %[v_token_id1] \n"
" v_mul_i32_i24 v54, s66, v54 \n" " v_mul_i32_i24 v54, s66, v54 \n"
" v_and_b32 v55, 0x00ffffff, %[v_token_id1] \n" " v_and_b32 v55, 0x00ffffff, %[v_token_id1] \n"
" v_add_u32 v7, v54, v55 \n" " v_add_u32 %[v_token_id1], v54, v55 \n"
" v_lshlrev_b32 v6, 2, v6 \n" " v_lshlrev_b32 %[v_token_id0], 2, %[v_token_id0] \n"
" v_lshlrev_b32 v7, 2, v7 \n" " v_lshlrev_b32 %[v_token_id1], 2, %[v_token_id1] \n"
" buffer_load_dword v14, v6, s[28:31], 0 offen \n" " buffer_load_dword v14, %[v_token_id0], %[s_res_aq], 0 offen \n"
" buffer_load_dword v15, v7, s[28:31], 0 offen \n" " buffer_load_dword v15, %[v_token_id1], %[s_res_aq], 0 offen \n"
" buffer_load_dword v16, v10, s[32:35], 0 offen \n" " buffer_load_dword v16, v10, %[s_res_gq], 0 offen \n"
" buffer_load_dword v17, v11, s[32:35], 0 offen \n" " buffer_load_dword v17, v11, %[s_res_gq], 0 offen \n"
" buffer_load_dword v18, v10, s[36:39], 0 offen \n" " buffer_load_dword v18, v10, %[s_res_smq], 0 offen \n"
" buffer_load_dword v19, v11, s[36:39], 0 offen \n" " buffer_load_dword v19, v11, %[s_res_smq], 0 offen \n"
" buffer_load_dword v20, v8, s[40:43], 0 offen \n" " buffer_load_dword v20, v8, s[40:43], 0 offen \n"
" buffer_load_dword v21, v9, s[40:43], 0 offen \n" " buffer_load_dword v21, v9, s[40:43], 0 offen \n"
...@@ -101,76 +77,76 @@ ...@@ -101,76 +77,76 @@
";---------------------------------------------- \n" ";---------------------------------------------- \n"
"; -- prefetch A0\n" "; -- prefetch A0\n"
"s_add_u32 m0, 0, %[s_m0_init] \n" "s_add_u32 m0, 0, %[s_m0_init] \n"
"buffer_load_dword %[v_os_a0], s[20:23], 0 offen lds \n" "buffer_load_dword %[v_os_a0], %[s_res_a], 0 offen lds \n"
"s_add_u32 m0, %[s_size_per_issue], m0 \n" "s_add_u32 m0, %[s_size_per_issue], m0 \n"
"buffer_load_dword %[v_os_a1], s[20:23], 0 offen lds \n" "buffer_load_dword %[v_os_a1], %[s_res_a], 0 offen lds \n"
"s_add_u32 m0, %[s_size_per_issue], m0 \n" "s_add_u32 m0, %[s_size_per_issue], m0 \n"
"buffer_load_dword %[v_os_a2], s[20:23], 0 offen lds \n" "buffer_load_dword %[v_os_a2], %[s_res_a], 0 offen lds \n"
"s_add_u32 m0, %[s_size_per_issue], m0 \n" "s_add_u32 m0, %[s_size_per_issue], m0 \n"
"buffer_load_dword %[v_os_a3], s[20:23], 0 offen lds \n" "buffer_load_dword %[v_os_a3], %[s_res_a], 0 offen lds \n"
"s_add_u32 m0, %[s_size_per_issue], m0 \n" "s_add_u32 m0, %[s_size_per_issue], m0 \n"
"buffer_load_dword %[v_os_a4], s[20:23], 0 offen lds \n" "buffer_load_dword %[v_os_a4], %[s_res_a], 0 offen lds \n"
"s_add_u32 m0, %[s_size_per_issue], m0 \n" "s_add_u32 m0, %[s_size_per_issue], m0 \n"
"buffer_load_dword %[v_os_a5], s[20:23], 0 offen lds \n" "buffer_load_dword %[v_os_a5], %[s_res_a], 0 offen lds \n"
"s_add_u32 m0, %[s_size_per_issue], m0 \n" "s_add_u32 m0, %[s_size_per_issue], m0 \n"
"buffer_load_dword %[v_os_a6], s[20:23], 0 offen lds \n" "buffer_load_dword %[v_os_a6], %[s_res_a], 0 offen lds \n"
"s_add_u32 m0, %[s_size_per_issue], m0 \n" "s_add_u32 m0, %[s_size_per_issue], m0 \n"
"buffer_load_dword %[v_os_a7], s[20:23], 0 offen lds \n" "buffer_load_dword %[v_os_a7], %[s_res_a], 0 offen lds \n"
"s_add_u32 m0, %[smem_sz], %[s_m0_init] \n" "s_add_u32 m0, %[smem_sz], %[s_m0_init] \n"
" s_add_u32 s20, s57, s20 \n" " s_add_u32 s20, s57, s20 \n"
" s_addc_u32 s21, 0, s21 \n" " s_addc_u32 s21, 0, s21 \n"
"; -- prefetch A1\n" "; -- prefetch A1\n"
"buffer_load_dword %[v_os_a0], s[20:23], 0 offen lds \n" "buffer_load_dword %[v_os_a0], %[s_res_a], 0 offen lds \n"
"s_add_u32 m0, %[s_size_per_issue], m0 \n" "s_add_u32 m0, %[s_size_per_issue], m0 \n"
"buffer_load_dword %[v_os_a1], s[20:23], 0 offen lds \n" "buffer_load_dword %[v_os_a1], %[s_res_a], 0 offen lds \n"
"s_add_u32 m0, %[s_size_per_issue], m0 \n" "s_add_u32 m0, %[s_size_per_issue], m0 \n"
"buffer_load_dword %[v_os_a2], s[20:23], 0 offen lds \n" "buffer_load_dword %[v_os_a2], %[s_res_a], 0 offen lds \n"
"s_add_u32 m0, %[s_size_per_issue], m0 \n" "s_add_u32 m0, %[s_size_per_issue], m0 \n"
"buffer_load_dword %[v_os_a3], s[20:23], 0 offen lds \n" "buffer_load_dword %[v_os_a3], %[s_res_a], 0 offen lds \n"
"s_add_u32 m0, %[s_size_per_issue], m0 \n" "s_add_u32 m0, %[s_size_per_issue], m0 \n"
"buffer_load_dword %[v_os_a4], s[20:23], 0 offen lds \n" "buffer_load_dword %[v_os_a4], %[s_res_a], 0 offen lds \n"
"s_add_u32 m0, %[s_size_per_issue], m0 \n" "s_add_u32 m0, %[s_size_per_issue], m0 \n"
"buffer_load_dword %[v_os_a5], s[20:23], 0 offen lds \n" "buffer_load_dword %[v_os_a5], %[s_res_a], 0 offen lds \n"
"s_add_u32 m0, %[s_size_per_issue], m0 \n" "s_add_u32 m0, %[s_size_per_issue], m0 \n"
"buffer_load_dword %[v_os_a6], s[20:23], 0 offen lds \n" "buffer_load_dword %[v_os_a6], %[s_res_a], 0 offen lds \n"
"s_add_u32 m0, %[s_size_per_issue], m0 \n" "s_add_u32 m0, %[s_size_per_issue], m0 \n"
"buffer_load_dword %[v_os_a7], s[20:23], 0 offen lds \n" "buffer_load_dword %[v_os_a7], %[s_res_a], 0 offen lds \n"
"s_add_u32 m0, 0, %[s_m0_init] \n" "s_add_u32 m0, 0, %[s_m0_init] \n"
" s_add_u32 s20, s57, s20 \n" " s_add_u32 s20, s57, s20 \n"
" s_addc_u32 s21, 0, s21 \n" " s_addc_u32 s21, 0, s21 \n"
"; -- prefetch B0\n" "; -- prefetch B0\n"
"buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[24:27], 0 offen \n" "buffer_load_dwordx4 acc[0:3], %[v_os_b0], %[s_res_b], 0 offen \n"
"buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[24:27], 0 offen offset:1024 \n" "buffer_load_dwordx4 acc[4:7], %[v_os_b0], %[s_res_b], 0 offen offset:1024 \n"
"buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[24:27], 0 offen offset:2048 \n" "buffer_load_dwordx4 acc[8:11], %[v_os_b0], %[s_res_b], 0 offen offset:2048 \n"
"buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[24:27], 0 offen offset:3072 \n" "buffer_load_dwordx4 acc[12:15], %[v_os_b0], %[s_res_b], 0 offen offset:3072 \n"
"buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[24:27], 0 offen \n" "buffer_load_dwordx4 acc[16:19], %[v_os_b1], %[s_res_b], 0 offen \n"
"buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[24:27], 0 offen offset:1024 \n" "buffer_load_dwordx4 acc[20:23], %[v_os_b1], %[s_res_b], 0 offen offset:1024 \n"
"buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[24:27], 0 offen offset:2048 \n" "buffer_load_dwordx4 acc[24:27], %[v_os_b1], %[s_res_b], 0 offen offset:2048 \n"
"buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[24:27], 0 offen offset:3072 \n" "buffer_load_dwordx4 acc[28:31], %[v_os_b1], %[s_res_b], 0 offen offset:3072 \n"
"buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[24:27], 0 offen \n" "buffer_load_dwordx4 acc[32:35], %[v_os_b2], %[s_res_b], 0 offen \n"
"buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[24:27], 0 offen offset:1024 \n" "buffer_load_dwordx4 acc[36:39], %[v_os_b2], %[s_res_b], 0 offen offset:1024 \n"
"buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[24:27], 0 offen offset:2048 \n" "buffer_load_dwordx4 acc[40:43], %[v_os_b2], %[s_res_b], 0 offen offset:2048 \n"
"buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[24:27], 0 offen offset:3072 \n" "buffer_load_dwordx4 acc[44:47], %[v_os_b2], %[s_res_b], 0 offen offset:3072 \n"
"buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[24:27], 0 offen \n" "buffer_load_dwordx4 acc[48:51], %[v_os_b3], %[s_res_b], 0 offen \n"
"buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[24:27], 0 offen offset:1024 \n" "buffer_load_dwordx4 acc[52:55], %[v_os_b3], %[s_res_b], 0 offen offset:1024 \n"
"buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[24:27], 0 offen offset:2048 \n" "buffer_load_dwordx4 acc[56:59], %[v_os_b3], %[s_res_b], 0 offen offset:2048 \n"
"buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[24:27], 0 offen offset:3072 \n" "buffer_load_dwordx4 acc[60:63], %[v_os_b3], %[s_res_b], 0 offen offset:3072 \n"
"buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[24:27], 0 offen \n" "buffer_load_dwordx4 acc[64:67], %[v_os_b4], %[s_res_b], 0 offen \n"
"buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[24:27], 0 offen offset:1024 \n" "buffer_load_dwordx4 acc[68:71], %[v_os_b4], %[s_res_b], 0 offen offset:1024 \n"
"buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[24:27], 0 offen offset:2048 \n" "buffer_load_dwordx4 acc[72:75], %[v_os_b4], %[s_res_b], 0 offen offset:2048 \n"
"buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[24:27], 0 offen offset:3072 \n" "buffer_load_dwordx4 acc[76:79], %[v_os_b4], %[s_res_b], 0 offen offset:3072 \n"
"buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[24:27], 0 offen \n" "buffer_load_dwordx4 acc[80:83], %[v_os_b5], %[s_res_b], 0 offen \n"
"buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[24:27], 0 offen offset:1024 \n" "buffer_load_dwordx4 acc[84:87], %[v_os_b5], %[s_res_b], 0 offen offset:1024 \n"
"buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[24:27], 0 offen offset:2048 \n" "buffer_load_dwordx4 acc[88:91], %[v_os_b5], %[s_res_b], 0 offen offset:2048 \n"
"buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[24:27], 0 offen offset:3072 \n" "buffer_load_dwordx4 acc[92:95], %[v_os_b5], %[s_res_b], 0 offen offset:3072 \n"
"buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[24:27], 0 offen \n" "buffer_load_dwordx4 acc[96:99], %[v_os_b6], %[s_res_b], 0 offen \n"
"buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[24:27], 0 offen offset:1024 \n" "buffer_load_dwordx4 acc[100:103], %[v_os_b6], %[s_res_b], 0 offen offset:1024 \n"
"buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[24:27], 0 offen offset:2048 \n" "buffer_load_dwordx4 acc[104:107], %[v_os_b6], %[s_res_b], 0 offen offset:2048 \n"
"buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[24:27], 0 offen offset:3072 \n" "buffer_load_dwordx4 acc[108:111], %[v_os_b6], %[s_res_b], 0 offen offset:3072 \n"
"buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[24:27], 0 offen \n" "buffer_load_dwordx4 acc[112:115], %[v_os_b7], %[s_res_b], 0 offen \n"
"buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[24:27], 0 offen offset:1024 \n" "buffer_load_dwordx4 acc[116:119], %[v_os_b7], %[s_res_b], 0 offen offset:1024 \n"
"buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[24:27], 0 offen offset:2048 \n" "buffer_load_dwordx4 acc[120:123], %[v_os_b7], %[s_res_b], 0 offen offset:2048 \n"
"buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[24:27], 0 offen offset:3072 \n" "buffer_load_dwordx4 acc[124:127], %[v_os_b7], %[s_res_b], 0 offen offset:3072 \n"
"s_add_u32 s24, s58, s24 \n" "s_add_u32 s24, s58, s24 \n"
"s_addc_u32 s25, 0, s25 \n" "s_addc_u32 s25, 0, s25 \n"
...@@ -191,189 +167,189 @@ ...@@ -191,189 +167,189 @@
" s_barrier \n" " s_barrier \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[0:1], v[192:193], [%[c0], %[c1], %[c2], %[c3]] \n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[0:1], v[192:193], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[2:3], v[194:195], [%[c0], %[c1], %[c2], %[c3]] \n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[2:3], v[194:195], [%[c0], %[c1], %[c2], %[c3]] \n"
" buffer_load_dwordx4 acc[128:131], %[v_os_b0], s[24:27], 0 offen \n" " buffer_load_dwordx4 acc[128:131], %[v_os_b0], %[s_res_b], 0 offen \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[4:5], v[196:197], [%[c0], %[c1], %[c2], %[c3]] \n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[4:5], v[196:197], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[6:7], v[198:199], [%[c0], %[c1], %[c2], %[c3]] \n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[6:7], v[198:199], [%[c0], %[c1], %[c2], %[c3]] \n"
" buffer_load_dword s[20:23], 0 offen lds \n" " buffer_load_dword %[v_os_a0] %[s_res_a], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n" " s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[8:9], v[200:201], [%[c0], %[c1], %[c2], %[c3]] \n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[8:9], v[200:201], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[10:11], v[202:203], [%[c0], %[c1], %[c2], %[c3]] \n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[10:11], v[202:203], [%[c0], %[c1], %[c2], %[c3]] \n"
" buffer_load_dwordx4 acc[132:135], %[v_os_b0], s[24:27], 0 offen offset:1024 \n" " buffer_load_dwordx4 acc[132:135], %[v_os_b0], %[s_res_b], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[12:13], v[204:205], [%[c0], %[c1], %[c2], %[c3]] \n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[12:13], v[204:205], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[14:15], v[206:207], [%[c0], %[c1], %[c2], %[c3]] \n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[14:15], v[206:207], [%[c0], %[c1], %[c2], %[c3]] \n"
" buffer_load_dword s[20:23], 0 offen lds \n" " buffer_load_dword %[v_os_a1] %[s_res_a], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n" " s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[0:1], v[208:209], [%[c4], %[c5], %[c6], %[c7]] \n" _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[0:1], v[208:209], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[2:3], v[210:211], [%[c4], %[c5], %[c6], %[c7]] \n" _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[2:3], v[210:211], [%[c4], %[c5], %[c6], %[c7]] \n"
" buffer_load_dwordx4 acc[136:139], %[v_os_b0], s[24:27], 0 offen offset:2048 \n" " buffer_load_dwordx4 acc[136:139], %[v_os_b0], %[s_res_b], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[4:5], v[212:213], [%[c4], %[c5], %[c6], %[c7]] \n" _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[4:5], v[212:213], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[6:7], v[214:215], [%[c4], %[c5], %[c6], %[c7]] \n" _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[6:7], v[214:215], [%[c4], %[c5], %[c6], %[c7]] \n"
" buffer_load_dword s[20:23], 0 offen lds \n" " buffer_load_dword %[v_os_a2] %[s_res_a], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n" " s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[8:9], v[216:217], [%[c4], %[c5], %[c6], %[c7]] \n" _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[8:9], v[216:217], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[10:11], v[218:219], [%[c4], %[c5], %[c6], %[c7]] \n" _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[10:11], v[218:219], [%[c4], %[c5], %[c6], %[c7]] \n"
" buffer_load_dwordx4 acc[140:143], %[v_os_b0], s[24:27], 0 offen offset:3072 \n" " buffer_load_dwordx4 acc[140:143], %[v_os_b0], %[s_res_b], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[12:13], v[220:221], [%[c4], %[c5], %[c6], %[c7]] \n" _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[12:13], v[220:221], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[14:15], v[222:223], [%[c4], %[c5], %[c6], %[c7]] \n" _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[14:15], v[222:223], [%[c4], %[c5], %[c6], %[c7]] \n"
" buffer_load_dword s[20:23], 0 offen lds \n" " buffer_load_dword %[v_os_a3] %[s_res_a], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n" " s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[16:17], v[192:193], [%[c8], %[c9], %[c10], %[c11]] \n" _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[16:17], v[192:193], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[18:19], v[194:195], [%[c8], %[c9], %[c10], %[c11]] \n" _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[18:19], v[194:195], [%[c8], %[c9], %[c10], %[c11]] \n"
" buffer_load_dwordx4 acc[144:147], %[v_os_b1], s[24:27], 0 offen \n" " buffer_load_dwordx4 acc[144:147], %[v_os_b1], %[s_res_b], 0 offen \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[20:21], v[196:197], [%[c8], %[c9], %[c10], %[c11]] \n" _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[20:21], v[196:197], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[22:23], v[198:199], [%[c8], %[c9], %[c10], %[c11]] \n" _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[22:23], v[198:199], [%[c8], %[c9], %[c10], %[c11]] \n"
" buffer_load_dword s[20:23], 0 offen lds \n" " buffer_load_dword %[v_os_a4] %[s_res_a], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n" " s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[24:25], v[200:201], [%[c8], %[c9], %[c10], %[c11]] \n" _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[24:25], v[200:201], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[26:27], v[202:203], [%[c8], %[c9], %[c10], %[c11]] \n" _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[26:27], v[202:203], [%[c8], %[c9], %[c10], %[c11]] \n"
" buffer_load_dwordx4 acc[148:151], %[v_os_b1], s[24:27], 0 offen offset:1024 \n" " buffer_load_dwordx4 acc[148:151], %[v_os_b1], %[s_res_b], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[28:29], v[204:205], [%[c8], %[c9], %[c10], %[c11]] \n" _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[28:29], v[204:205], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[30:31], v[206:207], [%[c8], %[c9], %[c10], %[c11]] \n" _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[30:31], v[206:207], [%[c8], %[c9], %[c10], %[c11]] \n"
" buffer_load_dword s[20:23], 0 offen lds \n" " buffer_load_dword %[v_os_a5] %[s_res_a], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n" " s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[16:17], v[208:209], [%[c12], %[c13], %[c14], %[c15]] \n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[16:17], v[208:209], [%[c12], %[c13], %[c14], %[c15]] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[18:19], v[210:211], [%[c12], %[c13], %[c14], %[c15]] \n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[18:19], v[210:211], [%[c12], %[c13], %[c14], %[c15]] \n"
" buffer_load_dwordx4 acc[152:155], %[v_os_b1], s[24:27], 0 offen offset:2048 \n" " buffer_load_dwordx4 acc[152:155], %[v_os_b1], %[s_res_b], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[20:21], v[212:213], [%[c12], %[c13], %[c14], %[c15]] \n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[20:21], v[212:213], [%[c12], %[c13], %[c14], %[c15]] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[22:23], v[214:215], [%[c12], %[c13], %[c14], %[c15]] \n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[22:23], v[214:215], [%[c12], %[c13], %[c14], %[c15]] \n"
" buffer_load_dword s[20:23], 0 offen lds \n" " buffer_load_dword %[v_os_a6] %[s_res_a], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n" " s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[24:25], v[216:217], [%[c12], %[c13], %[c14], %[c15]] \n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[24:25], v[216:217], [%[c12], %[c13], %[c14], %[c15]] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[26:27], v[218:219], [%[c12], %[c13], %[c14], %[c15]] \n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[26:27], v[218:219], [%[c12], %[c13], %[c14], %[c15]] \n"
" buffer_load_dwordx4 acc[156:159], %[v_os_b1], s[24:27], 0 offen offset:3072 \n" " buffer_load_dwordx4 acc[156:159], %[v_os_b1], %[s_res_b], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[28:29], v[220:221], [%[c12], %[c13], %[c14], %[c15]] \n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[28:29], v[220:221], [%[c12], %[c13], %[c14], %[c15]] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[30:31], v[222:223], [%[c12], %[c13], %[c14], %[c15]] \n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[30:31], v[222:223], [%[c12], %[c13], %[c14], %[c15]] \n"
" buffer_load_dword s[20:23], 0 offen lds \n" " buffer_load_dword %[v_os_a7] %[s_res_a], 0 offen lds \n"
" s_add_u32 m0, %[smem_sz], %[s_m0_init] \n" " s_add_u32 m0, %[smem_sz], %[s_m0_init] \n"
" s_waitcnt vmcnt(32) \n" " s_waitcnt vmcnt(32) \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[32:33], v[192:193], [%[c16], %[c17], %[c18], %[c19]] \n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[32:33], v[192:193], [%[c16], %[c17], %[c18], %[c19]] \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[34:35], v[194:195], [%[c16], %[c17], %[c18], %[c19]] \n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[34:35], v[194:195], [%[c16], %[c17], %[c18], %[c19]] \n"
" buffer_load_dwordx4 acc[160:163], %[v_os_b2], s[24:27], 0 offen \n" " buffer_load_dwordx4 acc[160:163], %[v_os_b2], %[s_res_b], 0 offen \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[36:37], v[196:197], [%[c16], %[c17], %[c18], %[c19]] \n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[36:37], v[196:197], [%[c16], %[c17], %[c18], %[c19]] \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[38:39], v[198:199], [%[c16], %[c17], %[c18], %[c19]] \n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[38:39], v[198:199], [%[c16], %[c17], %[c18], %[c19]] \n"
" ds_read_b128 v[224:227], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_0] \n" " ds_read_b128 v[224:227], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_0] \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[40:41], v[200:201], [%[c16], %[c17], %[c18], %[c19]] \n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[40:41], v[200:201], [%[c16], %[c17], %[c18], %[c19]] \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[42:43], v[202:203], [%[c16], %[c17], %[c18], %[c19]] \n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[42:43], v[202:203], [%[c16], %[c17], %[c18], %[c19]] \n"
" buffer_load_dwordx4 acc[164:167], %[v_os_b2], s[24:27], 0 offen offset:1024 \n" " buffer_load_dwordx4 acc[164:167], %[v_os_b2], %[s_res_b], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[44:45], v[204:205], [%[c16], %[c17], %[c18], %[c19]] \n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[44:45], v[204:205], [%[c16], %[c17], %[c18], %[c19]] \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[46:47], v[206:207], [%[c16], %[c17], %[c18], %[c19]] \n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[46:47], v[206:207], [%[c16], %[c17], %[c18], %[c19]] \n"
" ds_read_b128 v[228:231], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_1] \n" " ds_read_b128 v[228:231], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_1] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[32:33], v[208:209], [%[c20], %[c21], %[c22], %[c23]] \n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[32:33], v[208:209], [%[c20], %[c21], %[c22], %[c23]] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[34:35], v[210:211], [%[c20], %[c21], %[c22], %[c23]] \n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[34:35], v[210:211], [%[c20], %[c21], %[c22], %[c23]] \n"
" buffer_load_dwordx4 acc[168:171], %[v_os_b2], s[24:27], 0 offen offset:2048 \n" " buffer_load_dwordx4 acc[168:171], %[v_os_b2], %[s_res_b], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[36:37], v[212:213], [%[c20], %[c21], %[c22], %[c23]] \n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[36:37], v[212:213], [%[c20], %[c21], %[c22], %[c23]] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[38:39], v[214:215], [%[c20], %[c21], %[c22], %[c23]] \n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[38:39], v[214:215], [%[c20], %[c21], %[c22], %[c23]] \n"
" ds_read_b128 v[232:235], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_2] \n" " ds_read_b128 v[232:235], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_2] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[40:41], v[216:217], [%[c20], %[c21], %[c22], %[c23]] \n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[40:41], v[216:217], [%[c20], %[c21], %[c22], %[c23]] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[42:43], v[218:219], [%[c20], %[c21], %[c22], %[c23]] \n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[42:43], v[218:219], [%[c20], %[c21], %[c22], %[c23]] \n"
" buffer_load_dwordx4 acc[172:175], %[v_os_b2], s[24:27], 0 offen offset:3072 \n" " buffer_load_dwordx4 acc[172:175], %[v_os_b2], %[s_res_b], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[44:45], v[220:221], [%[c20], %[c21], %[c22], %[c23]] \n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[44:45], v[220:221], [%[c20], %[c21], %[c22], %[c23]] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[46:47], v[222:223], [%[c20], %[c21], %[c22], %[c23]] \n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[46:47], v[222:223], [%[c20], %[c21], %[c22], %[c23]] \n"
" ds_read_b128 v[236:239], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_3] \n" " ds_read_b128 v[236:239], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_3] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[48:49], v[192:193], [%[c24], %[c25], %[c26], %[c27]] \n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[48:49], v[192:193], [%[c24], %[c25], %[c26], %[c27]] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[50:51], v[194:195], [%[c24], %[c25], %[c26], %[c27]] \n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[50:51], v[194:195], [%[c24], %[c25], %[c26], %[c27]] \n"
" buffer_load_dwordx4 acc[176:179], %[v_os_b3], s[24:27], 0 offen \n" " buffer_load_dwordx4 acc[176:179], %[v_os_b3], %[s_res_b], 0 offen \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[52:53], v[196:197], [%[c24], %[c25], %[c26], %[c27]] \n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[52:53], v[196:197], [%[c24], %[c25], %[c26], %[c27]] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[54:55], v[198:199], [%[c24], %[c25], %[c26], %[c27]] \n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[54:55], v[198:199], [%[c24], %[c25], %[c26], %[c27]] \n"
" ds_read_b128 v[240:243], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_4] \n" " ds_read_b128 v[240:243], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_4] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[56:57], v[200:201], [%[c24], %[c25], %[c26], %[c27]] \n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[56:57], v[200:201], [%[c24], %[c25], %[c26], %[c27]] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[58:59], v[202:203], [%[c24], %[c25], %[c26], %[c27]] \n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[58:59], v[202:203], [%[c24], %[c25], %[c26], %[c27]] \n"
" buffer_load_dwordx4 acc[180:183], %[v_os_b3], s[24:27], 0 offen offset:1024 \n" " buffer_load_dwordx4 acc[180:183], %[v_os_b3], %[s_res_b], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[60:61], v[204:205], [%[c24], %[c25], %[c26], %[c27]] \n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[60:61], v[204:205], [%[c24], %[c25], %[c26], %[c27]] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[62:63], v[206:207], [%[c24], %[c25], %[c26], %[c27]] \n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[62:63], v[206:207], [%[c24], %[c25], %[c26], %[c27]] \n"
" ds_read_b128 v[244:247], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_5] \n" " ds_read_b128 v[244:247], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_5] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[48:49], v[208:209], [%[c28], %[c29], %[c30], %[c31]] \n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[48:49], v[208:209], [%[c28], %[c29], %[c30], %[c31]] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[50:51], v[210:211], [%[c28], %[c29], %[c30], %[c31]] \n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[50:51], v[210:211], [%[c28], %[c29], %[c30], %[c31]] \n"
" buffer_load_dwordx4 acc[184:187], %[v_os_b3], s[24:27], 0 offen offset:2048 \n" " buffer_load_dwordx4 acc[184:187], %[v_os_b3], %[s_res_b], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[52:53], v[212:213], [%[c28], %[c29], %[c30], %[c31]] \n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[52:53], v[212:213], [%[c28], %[c29], %[c30], %[c31]] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[54:55], v[214:215], [%[c28], %[c29], %[c30], %[c31]] \n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[54:55], v[214:215], [%[c28], %[c29], %[c30], %[c31]] \n"
" ds_read_b128 v[248:251], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_6] \n" " ds_read_b128 v[248:251], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_6] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[56:57], v[216:217], [%[c28], %[c29], %[c30], %[c31]] \n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[56:57], v[216:217], [%[c28], %[c29], %[c30], %[c31]] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[58:59], v[218:219], [%[c28], %[c29], %[c30], %[c31]] \n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[58:59], v[218:219], [%[c28], %[c29], %[c30], %[c31]] \n"
" buffer_load_dwordx4 acc[188:191], %[v_os_b3], s[24:27], 0 offen offset:3072 \n" " buffer_load_dwordx4 acc[188:191], %[v_os_b3], %[s_res_b], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[60:61], v[220:221], [%[c28], %[c29], %[c30], %[c31]] \n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[60:61], v[220:221], [%[c28], %[c29], %[c30], %[c31]] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[62:63], v[222:223], [%[c28], %[c29], %[c30], %[c31]] \n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[62:63], v[222:223], [%[c28], %[c29], %[c30], %[c31]] \n"
" ds_read_b128 v[252:255], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_7] \n" " ds_read_b128 v[252:255], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_7] \n"
" s_waitcnt vmcnt(32) \n" " s_waitcnt vmcnt(32) \n"
_UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[64:65], v[192:193], [%[c32], %[c33], %[c34], %[c35]] \n" _UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[64:65], v[192:193], [%[c32], %[c33], %[c34], %[c35]] \n"
_UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[66:67], v[194:195], [%[c32], %[c33], %[c34], %[c35]] \n" _UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[66:67], v[194:195], [%[c32], %[c33], %[c34], %[c35]] \n"
" buffer_load_dwordx4 acc[192:195], %[v_os_b4], s[24:27], 0 offen \n" " buffer_load_dwordx4 acc[192:195], %[v_os_b4], %[s_res_b], 0 offen \n"
_UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[68:69], v[196:197], [%[c32], %[c33], %[c34], %[c35]] \n" _UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[68:69], v[196:197], [%[c32], %[c33], %[c34], %[c35]] \n"
_UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[70:71], v[198:199], [%[c32], %[c33], %[c34], %[c35]] \n" _UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[70:71], v[198:199], [%[c32], %[c33], %[c34], %[c35]] \n"
_UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[72:73], v[200:201], [%[c32], %[c33], %[c34], %[c35]] \n" _UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[72:73], v[200:201], [%[c32], %[c33], %[c34], %[c35]] \n"
_UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[74:75], v[202:203], [%[c32], %[c33], %[c34], %[c35]] \n" _UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[74:75], v[202:203], [%[c32], %[c33], %[c34], %[c35]] \n"
" buffer_load_dwordx4 acc[196:199], %[v_os_b4], s[24:27], 0 offen offset:1024 \n" " buffer_load_dwordx4 acc[196:199], %[v_os_b4], %[s_res_b], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[76:77], v[204:205], [%[c32], %[c33], %[c34], %[c35]] \n" _UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[76:77], v[204:205], [%[c32], %[c33], %[c34], %[c35]] \n"
_UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[78:79], v[206:207], [%[c32], %[c33], %[c34], %[c35]] \n" _UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[78:79], v[206:207], [%[c32], %[c33], %[c34], %[c35]] \n"
_UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[64:65], v[208:209], [%[c36], %[c37], %[c38], %[c39]] \n" _UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[64:65], v[208:209], [%[c36], %[c37], %[c38], %[c39]] \n"
_UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[66:67], v[210:211], [%[c36], %[c37], %[c38], %[c39]] \n" _UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[66:67], v[210:211], [%[c36], %[c37], %[c38], %[c39]] \n"
" buffer_load_dwordx4 acc[200:203], %[v_os_b4], s[24:27], 0 offen offset:2048 \n" " buffer_load_dwordx4 acc[200:203], %[v_os_b4], %[s_res_b], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[68:69], v[212:213], [%[c36], %[c37], %[c38], %[c39]] \n" _UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[68:69], v[212:213], [%[c36], %[c37], %[c38], %[c39]] \n"
_UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[70:71], v[214:215], [%[c36], %[c37], %[c38], %[c39]] \n" _UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[70:71], v[214:215], [%[c36], %[c37], %[c38], %[c39]] \n"
_UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[72:73], v[216:217], [%[c36], %[c37], %[c38], %[c39]] \n" _UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[72:73], v[216:217], [%[c36], %[c37], %[c38], %[c39]] \n"
_UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[74:75], v[218:219], [%[c36], %[c37], %[c38], %[c39]] \n" _UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[74:75], v[218:219], [%[c36], %[c37], %[c38], %[c39]] \n"
" buffer_load_dwordx4 acc[204:207], %[v_os_b4], s[24:27], 0 offen offset:3072 \n" " buffer_load_dwordx4 acc[204:207], %[v_os_b4], %[s_res_b], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[76:77], v[220:221], [%[c36], %[c37], %[c38], %[c39]] \n" _UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[76:77], v[220:221], [%[c36], %[c37], %[c38], %[c39]] \n"
_UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[78:79], v[222:223], [%[c36], %[c37], %[c38], %[c39]] \n" _UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[78:79], v[222:223], [%[c36], %[c37], %[c38], %[c39]] \n"
_UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[80:81], v[192:193], [%[c40], %[c41], %[c42], %[c43]] \n" _UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[80:81], v[192:193], [%[c40], %[c41], %[c42], %[c43]] \n"
_UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[82:83], v[194:195], [%[c40], %[c41], %[c42], %[c43]] \n" _UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[82:83], v[194:195], [%[c40], %[c41], %[c42], %[c43]] \n"
" buffer_load_dwordx4 acc[208:211], %[v_os_b5], s[24:27], 0 offen \n" " buffer_load_dwordx4 acc[208:211], %[v_os_b5], %[s_res_b], 0 offen \n"
_UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[84:85], v[196:197], [%[c40], %[c41], %[c42], %[c43]] \n" _UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[84:85], v[196:197], [%[c40], %[c41], %[c42], %[c43]] \n"
_UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[86:87], v[198:199], [%[c40], %[c41], %[c42], %[c43]] \n" _UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[86:87], v[198:199], [%[c40], %[c41], %[c42], %[c43]] \n"
_UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[88:89], v[200:201], [%[c40], %[c41], %[c42], %[c43]] \n" _UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[88:89], v[200:201], [%[c40], %[c41], %[c42], %[c43]] \n"
_UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[90:91], v[202:203], [%[c40], %[c41], %[c42], %[c43]] \n" _UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[90:91], v[202:203], [%[c40], %[c41], %[c42], %[c43]] \n"
" buffer_load_dwordx4 acc[212:215], %[v_os_b5], s[24:27], 0 offen offset:1024 \n" " buffer_load_dwordx4 acc[212:215], %[v_os_b5], %[s_res_b], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[92:93], v[204:205], [%[c40], %[c41], %[c42], %[c43]] \n" _UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[92:93], v[204:205], [%[c40], %[c41], %[c42], %[c43]] \n"
_UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[94:95], v[206:207], [%[c40], %[c41], %[c42], %[c43]] \n" _UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[94:95], v[206:207], [%[c40], %[c41], %[c42], %[c43]] \n"
_UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[80:81], v[208:209], [%[c44], %[c45], %[c46], %[c47]] \n" _UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[80:81], v[208:209], [%[c44], %[c45], %[c46], %[c47]] \n"
_UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[82:83], v[210:211], [%[c44], %[c45], %[c46], %[c47]] \n" _UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[82:83], v[210:211], [%[c44], %[c45], %[c46], %[c47]] \n"
" buffer_load_dwordx4 acc[216:219], %[v_os_b5], s[24:27], 0 offen offset:2048 \n" " buffer_load_dwordx4 acc[216:219], %[v_os_b5], %[s_res_b], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[84:85], v[212:213], [%[c44], %[c45], %[c46], %[c47]] \n" _UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[84:85], v[212:213], [%[c44], %[c45], %[c46], %[c47]] \n"
_UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[86:87], v[214:215], [%[c44], %[c45], %[c46], %[c47]] \n" _UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[86:87], v[214:215], [%[c44], %[c45], %[c46], %[c47]] \n"
_UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[88:89], v[216:217], [%[c44], %[c45], %[c46], %[c47]] \n" _UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[88:89], v[216:217], [%[c44], %[c45], %[c46], %[c47]] \n"
_UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[90:91], v[218:219], [%[c44], %[c45], %[c46], %[c47]] \n" _UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[90:91], v[218:219], [%[c44], %[c45], %[c46], %[c47]] \n"
" buffer_load_dwordx4 acc[220:223], %[v_os_b5], s[24:27], 0 offen offset:3072 \n" " buffer_load_dwordx4 acc[220:223], %[v_os_b5], %[s_res_b], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[92:93], v[220:221], [%[c44], %[c45], %[c46], %[c47]] \n" _UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[92:93], v[220:221], [%[c44], %[c45], %[c46], %[c47]] \n"
_UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[94:95], v[222:223], [%[c44], %[c45], %[c46], %[c47]] \n" _UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[94:95], v[222:223], [%[c44], %[c45], %[c46], %[c47]] \n"
" s_waitcnt vmcnt(32) \n" " s_waitcnt vmcnt(32) \n"
_UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[96:97], v[192:193], [%[c48], %[c49], %[c50], %[c51]] \n" _UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[96:97], v[192:193], [%[c48], %[c49], %[c50], %[c51]] \n"
_UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[98:99], v[194:195], [%[c48], %[c49], %[c50], %[c51]] \n" _UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[98:99], v[194:195], [%[c48], %[c49], %[c50], %[c51]] \n"
" buffer_load_dwordx4 acc[224:227], %[v_os_b6], s[24:27], 0 offen \n" " buffer_load_dwordx4 acc[224:227], %[v_os_b6], %[s_res_b], 0 offen \n"
_UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[100:101], v[196:197], [%[c48], %[c49], %[c50], %[c51]] \n" _UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[100:101], v[196:197], [%[c48], %[c49], %[c50], %[c51]] \n"
_UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[102:103], v[198:199], [%[c48], %[c49], %[c50], %[c51]] \n" _UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[102:103], v[198:199], [%[c48], %[c49], %[c50], %[c51]] \n"
_UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[104:105], v[200:201], [%[c48], %[c49], %[c50], %[c51]] \n" _UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[104:105], v[200:201], [%[c48], %[c49], %[c50], %[c51]] \n"
_UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[106:107], v[202:203], [%[c48], %[c49], %[c50], %[c51]] \n" _UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[106:107], v[202:203], [%[c48], %[c49], %[c50], %[c51]] \n"
" buffer_load_dwordx4 acc[228:231], %[v_os_b6], s[24:27], 0 offen offset:1024 \n" " buffer_load_dwordx4 acc[228:231], %[v_os_b6], %[s_res_b], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[108:109], v[204:205], [%[c48], %[c49], %[c50], %[c51]] \n" _UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[108:109], v[204:205], [%[c48], %[c49], %[c50], %[c51]] \n"
_UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[110:111], v[206:207], [%[c48], %[c49], %[c50], %[c51]] \n" _UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[110:111], v[206:207], [%[c48], %[c49], %[c50], %[c51]] \n"
_UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[96:97], v[208:209], [%[c52], %[c53], %[c54], %[c55]] \n" _UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[96:97], v[208:209], [%[c52], %[c53], %[c54], %[c55]] \n"
_UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[98:99], v[210:211], [%[c52], %[c53], %[c54], %[c55]] \n" _UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[98:99], v[210:211], [%[c52], %[c53], %[c54], %[c55]] \n"
" buffer_load_dwordx4 acc[232:235], %[v_os_b6], s[24:27], 0 offen offset:2048 \n" " buffer_load_dwordx4 acc[232:235], %[v_os_b6], %[s_res_b], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[100:101], v[212:213], [%[c52], %[c53], %[c54], %[c55]] \n" _UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[100:101], v[212:213], [%[c52], %[c53], %[c54], %[c55]] \n"
_UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[102:103], v[214:215], [%[c52], %[c53], %[c54], %[c55]] \n" _UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[102:103], v[214:215], [%[c52], %[c53], %[c54], %[c55]] \n"
_UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[104:105], v[216:217], [%[c52], %[c53], %[c54], %[c55]] \n" _UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[104:105], v[216:217], [%[c52], %[c53], %[c54], %[c55]] \n"
_UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[106:107], v[218:219], [%[c52], %[c53], %[c54], %[c55]] \n" _UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[106:107], v[218:219], [%[c52], %[c53], %[c54], %[c55]] \n"
" buffer_load_dwordx4 acc[236:239], %[v_os_b6], s[24:27], 0 offen offset:3072 \n" " buffer_load_dwordx4 acc[236:239], %[v_os_b6], %[s_res_b], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[108:109], v[220:221], [%[c52], %[c53], %[c54], %[c55]] \n" _UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[108:109], v[220:221], [%[c52], %[c53], %[c54], %[c55]] \n"
_UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[110:111], v[222:223], [%[c52], %[c53], %[c54], %[c55]] \n" _UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[110:111], v[222:223], [%[c52], %[c53], %[c54], %[c55]] \n"
_UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[112:113], v[192:193], [%[c56], %[c57], %[c58], %[c59]] \n" _UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[112:113], v[192:193], [%[c56], %[c57], %[c58], %[c59]] \n"
_UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[114:115], v[194:195], [%[c56], %[c57], %[c58], %[c59]] \n" _UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[114:115], v[194:195], [%[c56], %[c57], %[c58], %[c59]] \n"
" buffer_load_dwordx4 acc[240:243], %[v_os_b7], s[24:27], 0 offen \n" " buffer_load_dwordx4 acc[240:243], %[v_os_b7], %[s_res_b], 0 offen \n"
_UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[116:117], v[196:197], [%[c56], %[c57], %[c58], %[c59]] \n" _UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[116:117], v[196:197], [%[c56], %[c57], %[c58], %[c59]] \n"
_UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[118:119], v[198:199], [%[c56], %[c57], %[c58], %[c59]] \n" _UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[118:119], v[198:199], [%[c56], %[c57], %[c58], %[c59]] \n"
_UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[120:121], v[200:201], [%[c56], %[c57], %[c58], %[c59]] \n" _UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[120:121], v[200:201], [%[c56], %[c57], %[c58], %[c59]] \n"
_UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[122:123], v[202:203], [%[c56], %[c57], %[c58], %[c59]] \n" _UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[122:123], v[202:203], [%[c56], %[c57], %[c58], %[c59]] \n"
" buffer_load_dwordx4 acc[244:247], %[v_os_b7], s[24:27], 0 offen offset:1024 \n" " buffer_load_dwordx4 acc[244:247], %[v_os_b7], %[s_res_b], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[124:125], v[204:205], [%[c56], %[c57], %[c58], %[c59]] \n" _UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[124:125], v[204:205], [%[c56], %[c57], %[c58], %[c59]] \n"
_UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[126:127], v[206:207], [%[c56], %[c57], %[c58], %[c59]] \n" _UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[126:127], v[206:207], [%[c56], %[c57], %[c58], %[c59]] \n"
_UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[112:113], v[208:209], [%[c60], %[c61], %[c62], %[c63]] \n" _UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[112:113], v[208:209], [%[c60], %[c61], %[c62], %[c63]] \n"
_UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[114:115], v[210:211], [%[c60], %[c61], %[c62], %[c63]] \n" _UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[114:115], v[210:211], [%[c60], %[c61], %[c62], %[c63]] \n"
" buffer_load_dwordx4 acc[248:251], %[v_os_b7], s[24:27], 0 offen offset:2048 \n" " buffer_load_dwordx4 acc[248:251], %[v_os_b7], %[s_res_b], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[116:117], v[212:213], [%[c60], %[c61], %[c62], %[c63]] \n" _UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[116:117], v[212:213], [%[c60], %[c61], %[c62], %[c63]] \n"
_UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[118:119], v[214:215], [%[c60], %[c61], %[c62], %[c63]] \n" _UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[118:119], v[214:215], [%[c60], %[c61], %[c62], %[c63]] \n"
_UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[120:121], v[216:217], [%[c60], %[c61], %[c62], %[c63]] \n" _UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[120:121], v[216:217], [%[c60], %[c61], %[c62], %[c63]] \n"
_UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[122:123], v[218:219], [%[c60], %[c61], %[c62], %[c63]] \n" _UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[122:123], v[218:219], [%[c60], %[c61], %[c62], %[c63]] \n"
" buffer_load_dwordx4 acc[252:255], %[v_os_b7], s[24:27], 0 offen offset:3072 \n" " buffer_load_dwordx4 acc[252:255], %[v_os_b7], %[s_res_b], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[124:125], v[220:221], [%[c60], %[c61], %[c62], %[c63]] \n" _UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[124:125], v[220:221], [%[c60], %[c61], %[c62], %[c63]] \n"
_UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[126:127], v[222:223], [%[c60], %[c61], %[c62], %[c63]] \n" _UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[126:127], v[222:223], [%[c60], %[c61], %[c62], %[c63]] \n"
" s_add_u32 s60, 0x00000300, s80 \n" " s_add_u32 s60, 0x00000300, s80 \n"
...@@ -393,189 +369,189 @@ ...@@ -393,189 +369,189 @@
" s_barrier \n" " s_barrier \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[128:129], v[224:225], [%[c0], %[c1], %[c2], %[c3]] \n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[128:129], v[224:225], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[130:131], v[226:227], [%[c0], %[c1], %[c2], %[c3]] \n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[130:131], v[226:227], [%[c0], %[c1], %[c2], %[c3]] \n"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[24:27], 0 offen \n" " buffer_load_dwordx4 acc[0:3], %[v_os_b0], %[s_res_b], 0 offen \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[132:133], v[228:229], [%[c0], %[c1], %[c2], %[c3]] \n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[132:133], v[228:229], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[134:135], v[230:231], [%[c0], %[c1], %[c2], %[c3]] \n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[134:135], v[230:231], [%[c0], %[c1], %[c2], %[c3]] \n"
" buffer_load_dword s[20:23], 0 offen lds \n" " buffer_load_dword %[v_os_a0] %[s_res_a], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n" " s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[136:137], v[232:233], [%[c0], %[c1], %[c2], %[c3]] \n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[136:137], v[232:233], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[138:139], v[234:235], [%[c0], %[c1], %[c2], %[c3]] \n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[138:139], v[234:235], [%[c0], %[c1], %[c2], %[c3]] \n"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[24:27], 0 offen offset:1024 \n" " buffer_load_dwordx4 acc[4:7], %[v_os_b0], %[s_res_b], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[140:141], v[236:237], [%[c0], %[c1], %[c2], %[c3]] \n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[140:141], v[236:237], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[142:143], v[238:239], [%[c0], %[c1], %[c2], %[c3]] \n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[142:143], v[238:239], [%[c0], %[c1], %[c2], %[c3]] \n"
" buffer_load_dword s[20:23], 0 offen lds \n" " buffer_load_dword %[v_os_a1] %[s_res_a], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n" " s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[128:129], v[240:241], [%[c4], %[c5], %[c6], %[c7]] \n" _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[128:129], v[240:241], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[130:131], v[242:243], [%[c4], %[c5], %[c6], %[c7]] \n" _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[130:131], v[242:243], [%[c4], %[c5], %[c6], %[c7]] \n"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[24:27], 0 offen offset:2048 \n" " buffer_load_dwordx4 acc[8:11], %[v_os_b0], %[s_res_b], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[132:133], v[244:245], [%[c4], %[c5], %[c6], %[c7]] \n" _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[132:133], v[244:245], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[134:135], v[246:247], [%[c4], %[c5], %[c6], %[c7]] \n" _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[134:135], v[246:247], [%[c4], %[c5], %[c6], %[c7]] \n"
" buffer_load_dword s[20:23], 0 offen lds \n" " buffer_load_dword %[v_os_a2] %[s_res_a], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n" " s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[136:137], v[248:249], [%[c4], %[c5], %[c6], %[c7]] \n" _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[136:137], v[248:249], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[138:139], v[250:251], [%[c4], %[c5], %[c6], %[c7]] \n" _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[138:139], v[250:251], [%[c4], %[c5], %[c6], %[c7]] \n"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[24:27], 0 offen offset:3072 \n" " buffer_load_dwordx4 acc[12:15], %[v_os_b0], %[s_res_b], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[140:141], v[252:253], [%[c4], %[c5], %[c6], %[c7]] \n" _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[140:141], v[252:253], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[142:143], v[254:255], [%[c4], %[c5], %[c6], %[c7]] \n" _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[142:143], v[254:255], [%[c4], %[c5], %[c6], %[c7]] \n"
" buffer_load_dword s[20:23], 0 offen lds \n" " buffer_load_dword %[v_os_a3] %[s_res_a], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n" " s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[144:145], v[224:225], [%[c8], %[c9], %[c10], %[c11]] \n" _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[144:145], v[224:225], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[146:147], v[226:227], [%[c8], %[c9], %[c10], %[c11]] \n" _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[146:147], v[226:227], [%[c8], %[c9], %[c10], %[c11]] \n"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[24:27], 0 offen \n" " buffer_load_dwordx4 acc[16:19], %[v_os_b1], %[s_res_b], 0 offen \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[148:149], v[228:229], [%[c8], %[c9], %[c10], %[c11]] \n" _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[148:149], v[228:229], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[150:151], v[230:231], [%[c8], %[c9], %[c10], %[c11]] \n" _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[150:151], v[230:231], [%[c8], %[c9], %[c10], %[c11]] \n"
" buffer_load_dword s[20:23], 0 offen lds \n" " buffer_load_dword %[v_os_a4] %[s_res_a], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n" " s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[152:153], v[232:233], [%[c8], %[c9], %[c10], %[c11]] \n" _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[152:153], v[232:233], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[154:155], v[234:235], [%[c8], %[c9], %[c10], %[c11]] \n" _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[154:155], v[234:235], [%[c8], %[c9], %[c10], %[c11]] \n"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[24:27], 0 offen offset:1024 \n" " buffer_load_dwordx4 acc[20:23], %[v_os_b1], %[s_res_b], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[156:157], v[236:237], [%[c8], %[c9], %[c10], %[c11]] \n" _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[156:157], v[236:237], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[158:159], v[238:239], [%[c8], %[c9], %[c10], %[c11]] \n" _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[158:159], v[238:239], [%[c8], %[c9], %[c10], %[c11]] \n"
" buffer_load_dword s[20:23], 0 offen lds \n" " buffer_load_dword %[v_os_a5] %[s_res_a], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n" " s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[144:145], v[240:241], [%[c12], %[c13], %[c14], %[c15]] \n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[144:145], v[240:241], [%[c12], %[c13], %[c14], %[c15]] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[146:147], v[242:243], [%[c12], %[c13], %[c14], %[c15]] \n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[146:147], v[242:243], [%[c12], %[c13], %[c14], %[c15]] \n"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[24:27], 0 offen offset:2048 \n" " buffer_load_dwordx4 acc[24:27], %[v_os_b1], %[s_res_b], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[148:149], v[244:245], [%[c12], %[c13], %[c14], %[c15]] \n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[148:149], v[244:245], [%[c12], %[c13], %[c14], %[c15]] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[150:151], v[246:247], [%[c12], %[c13], %[c14], %[c15]] \n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[150:151], v[246:247], [%[c12], %[c13], %[c14], %[c15]] \n"
" buffer_load_dword s[20:23], 0 offen lds \n" " buffer_load_dword %[v_os_a6] %[s_res_a], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n" " s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[152:153], v[248:249], [%[c12], %[c13], %[c14], %[c15]] \n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[152:153], v[248:249], [%[c12], %[c13], %[c14], %[c15]] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[154:155], v[250:251], [%[c12], %[c13], %[c14], %[c15]] \n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[154:155], v[250:251], [%[c12], %[c13], %[c14], %[c15]] \n"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[24:27], 0 offen offset:3072 \n" " buffer_load_dwordx4 acc[28:31], %[v_os_b1], %[s_res_b], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[156:157], v[252:253], [%[c12], %[c13], %[c14], %[c15]] \n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[156:157], v[252:253], [%[c12], %[c13], %[c14], %[c15]] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[158:159], v[254:255], [%[c12], %[c13], %[c14], %[c15]] \n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[158:159], v[254:255], [%[c12], %[c13], %[c14], %[c15]] \n"
" buffer_load_dword s[20:23], 0 offen lds \n" " buffer_load_dword %[v_os_a7] %[s_res_a], 0 offen lds \n"
" s_add_u32 m0, 0, %[s_m0_init] \n" " s_add_u32 m0, 0, %[s_m0_init] \n"
" s_waitcnt vmcnt(32) \n" " s_waitcnt vmcnt(32) \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[160:161], v[224:225], [%[c16], %[c17], %[c18], %[c19]] \n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[160:161], v[224:225], [%[c16], %[c17], %[c18], %[c19]] \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[162:163], v[226:227], [%[c16], %[c17], %[c18], %[c19]] \n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[162:163], v[226:227], [%[c16], %[c17], %[c18], %[c19]] \n"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[24:27], 0 offen \n" " buffer_load_dwordx4 acc[32:35], %[v_os_b2], %[s_res_b], 0 offen \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[164:165], v[228:229], [%[c16], %[c17], %[c18], %[c19]] \n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[164:165], v[228:229], [%[c16], %[c17], %[c18], %[c19]] \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[166:167], v[230:231], [%[c16], %[c17], %[c18], %[c19]] \n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[166:167], v[230:231], [%[c16], %[c17], %[c18], %[c19]] \n"
" ds_read_b128 v[192:195], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_0] \n" " ds_read_b128 v[192:195], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_0] \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[168:169], v[232:233], [%[c16], %[c17], %[c18], %[c19]] \n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[168:169], v[232:233], [%[c16], %[c17], %[c18], %[c19]] \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[170:171], v[234:235], [%[c16], %[c17], %[c18], %[c19]] \n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[170:171], v[234:235], [%[c16], %[c17], %[c18], %[c19]] \n"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[24:27], 0 offen offset:1024 \n" " buffer_load_dwordx4 acc[36:39], %[v_os_b2], %[s_res_b], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[172:173], v[236:237], [%[c16], %[c17], %[c18], %[c19]] \n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[172:173], v[236:237], [%[c16], %[c17], %[c18], %[c19]] \n"
_UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[174:175], v[238:239], [%[c16], %[c17], %[c18], %[c19]] \n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[174:175], v[238:239], [%[c16], %[c17], %[c18], %[c19]] \n"
" ds_read_b128 v[196:199], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_1] \n" " ds_read_b128 v[196:199], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_1] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[160:161], v[240:241], [%[c20], %[c21], %[c22], %[c23]] \n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[160:161], v[240:241], [%[c20], %[c21], %[c22], %[c23]] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[162:163], v[242:243], [%[c20], %[c21], %[c22], %[c23]] \n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[162:163], v[242:243], [%[c20], %[c21], %[c22], %[c23]] \n"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[24:27], 0 offen offset:2048 \n" " buffer_load_dwordx4 acc[40:43], %[v_os_b2], %[s_res_b], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[164:165], v[244:245], [%[c20], %[c21], %[c22], %[c23]] \n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[164:165], v[244:245], [%[c20], %[c21], %[c22], %[c23]] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[166:167], v[246:247], [%[c20], %[c21], %[c22], %[c23]] \n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[166:167], v[246:247], [%[c20], %[c21], %[c22], %[c23]] \n"
" ds_read_b128 v[200:203], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_2] \n" " ds_read_b128 v[200:203], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_2] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[168:169], v[248:249], [%[c20], %[c21], %[c22], %[c23]] \n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[168:169], v[248:249], [%[c20], %[c21], %[c22], %[c23]] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[170:171], v[250:251], [%[c20], %[c21], %[c22], %[c23]] \n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[170:171], v[250:251], [%[c20], %[c21], %[c22], %[c23]] \n"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[24:27], 0 offen offset:3072 \n" " buffer_load_dwordx4 acc[44:47], %[v_os_b2], %[s_res_b], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[172:173], v[252:253], [%[c20], %[c21], %[c22], %[c23]] \n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[172:173], v[252:253], [%[c20], %[c21], %[c22], %[c23]] \n"
_UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[174:175], v[254:255], [%[c20], %[c21], %[c22], %[c23]] \n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[174:175], v[254:255], [%[c20], %[c21], %[c22], %[c23]] \n"
" ds_read_b128 v[204:207], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_3] \n" " ds_read_b128 v[204:207], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_3] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[176:177], v[224:225], [%[c24], %[c25], %[c26], %[c27]] \n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[176:177], v[224:225], [%[c24], %[c25], %[c26], %[c27]] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[178:179], v[226:227], [%[c24], %[c25], %[c26], %[c27]] \n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[178:179], v[226:227], [%[c24], %[c25], %[c26], %[c27]] \n"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[24:27], 0 offen \n" " buffer_load_dwordx4 acc[48:51], %[v_os_b3], %[s_res_b], 0 offen \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[180:181], v[228:229], [%[c24], %[c25], %[c26], %[c27]] \n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[180:181], v[228:229], [%[c24], %[c25], %[c26], %[c27]] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[182:183], v[230:231], [%[c24], %[c25], %[c26], %[c27]] \n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[182:183], v[230:231], [%[c24], %[c25], %[c26], %[c27]] \n"
" ds_read_b128 v[208:211], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_4] \n" " ds_read_b128 v[208:211], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_4] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[184:185], v[232:233], [%[c24], %[c25], %[c26], %[c27]] \n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[184:185], v[232:233], [%[c24], %[c25], %[c26], %[c27]] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[186:187], v[234:235], [%[c24], %[c25], %[c26], %[c27]] \n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[186:187], v[234:235], [%[c24], %[c25], %[c26], %[c27]] \n"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[24:27], 0 offen offset:1024 \n" " buffer_load_dwordx4 acc[52:55], %[v_os_b3], %[s_res_b], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[188:189], v[236:237], [%[c24], %[c25], %[c26], %[c27]] \n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[188:189], v[236:237], [%[c24], %[c25], %[c26], %[c27]] \n"
_UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[190:191], v[238:239], [%[c24], %[c25], %[c26], %[c27]] \n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[190:191], v[238:239], [%[c24], %[c25], %[c26], %[c27]] \n"
" ds_read_b128 v[212:215], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_5] \n" " ds_read_b128 v[212:215], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_5] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[176:177], v[240:241], [%[c28], %[c29], %[c30], %[c31]] \n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[176:177], v[240:241], [%[c28], %[c29], %[c30], %[c31]] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[178:179], v[242:243], [%[c28], %[c29], %[c30], %[c31]] \n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[178:179], v[242:243], [%[c28], %[c29], %[c30], %[c31]] \n"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[24:27], 0 offen offset:2048 \n" " buffer_load_dwordx4 acc[56:59], %[v_os_b3], %[s_res_b], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[180:181], v[244:245], [%[c28], %[c29], %[c30], %[c31]] \n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[180:181], v[244:245], [%[c28], %[c29], %[c30], %[c31]] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[182:183], v[246:247], [%[c28], %[c29], %[c30], %[c31]] \n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[182:183], v[246:247], [%[c28], %[c29], %[c30], %[c31]] \n"
" ds_read_b128 v[216:219], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_6] \n" " ds_read_b128 v[216:219], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_6] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[184:185], v[248:249], [%[c28], %[c29], %[c30], %[c31]] \n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[184:185], v[248:249], [%[c28], %[c29], %[c30], %[c31]] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[186:187], v[250:251], [%[c28], %[c29], %[c30], %[c31]] \n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[186:187], v[250:251], [%[c28], %[c29], %[c30], %[c31]] \n"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[24:27], 0 offen offset:3072 \n" " buffer_load_dwordx4 acc[60:63], %[v_os_b3], %[s_res_b], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[188:189], v[252:253], [%[c28], %[c29], %[c30], %[c31]] \n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[188:189], v[252:253], [%[c28], %[c29], %[c30], %[c31]] \n"
_UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[190:191], v[254:255], [%[c28], %[c29], %[c30], %[c31]] \n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[190:191], v[254:255], [%[c28], %[c29], %[c30], %[c31]] \n"
" ds_read_b128 v[220:223], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_7] \n" " ds_read_b128 v[220:223], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_7] \n"
" s_waitcnt vmcnt(32) \n" " s_waitcnt vmcnt(32) \n"
_UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[192:193], v[224:225], [%[c32], %[c33], %[c34], %[c35]] \n" _UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[192:193], v[224:225], [%[c32], %[c33], %[c34], %[c35]] \n"
_UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[194:195], v[226:227], [%[c32], %[c33], %[c34], %[c35]] \n" _UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[194:195], v[226:227], [%[c32], %[c33], %[c34], %[c35]] \n"
" buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[24:27], 0 offen \n" " buffer_load_dwordx4 acc[64:67], %[v_os_b4], %[s_res_b], 0 offen \n"
_UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[196:197], v[228:229], [%[c32], %[c33], %[c34], %[c35]] \n" _UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[196:197], v[228:229], [%[c32], %[c33], %[c34], %[c35]] \n"
_UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[198:199], v[230:231], [%[c32], %[c33], %[c34], %[c35]] \n" _UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[198:199], v[230:231], [%[c32], %[c33], %[c34], %[c35]] \n"
_UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[200:201], v[232:233], [%[c32], %[c33], %[c34], %[c35]] \n" _UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[200:201], v[232:233], [%[c32], %[c33], %[c34], %[c35]] \n"
_UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[202:203], v[234:235], [%[c32], %[c33], %[c34], %[c35]] \n" _UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[202:203], v[234:235], [%[c32], %[c33], %[c34], %[c35]] \n"
" buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[24:27], 0 offen offset:1024 \n" " buffer_load_dwordx4 acc[68:71], %[v_os_b4], %[s_res_b], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[204:205], v[236:237], [%[c32], %[c33], %[c34], %[c35]] \n" _UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[204:205], v[236:237], [%[c32], %[c33], %[c34], %[c35]] \n"
_UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[206:207], v[238:239], [%[c32], %[c33], %[c34], %[c35]] \n" _UK_MFMA_ " [%[c32], %[c33], %[c34], %[c35]], acc[206:207], v[238:239], [%[c32], %[c33], %[c34], %[c35]] \n"
_UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[192:193], v[240:241], [%[c36], %[c37], %[c38], %[c39]] \n" _UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[192:193], v[240:241], [%[c36], %[c37], %[c38], %[c39]] \n"
_UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[194:195], v[242:243], [%[c36], %[c37], %[c38], %[c39]] \n" _UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[194:195], v[242:243], [%[c36], %[c37], %[c38], %[c39]] \n"
" buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[24:27], 0 offen offset:2048 \n" " buffer_load_dwordx4 acc[72:75], %[v_os_b4], %[s_res_b], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[196:197], v[244:245], [%[c36], %[c37], %[c38], %[c39]] \n" _UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[196:197], v[244:245], [%[c36], %[c37], %[c38], %[c39]] \n"
_UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[198:199], v[246:247], [%[c36], %[c37], %[c38], %[c39]] \n" _UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[198:199], v[246:247], [%[c36], %[c37], %[c38], %[c39]] \n"
_UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[200:201], v[248:249], [%[c36], %[c37], %[c38], %[c39]] \n" _UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[200:201], v[248:249], [%[c36], %[c37], %[c38], %[c39]] \n"
_UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[202:203], v[250:251], [%[c36], %[c37], %[c38], %[c39]] \n" _UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[202:203], v[250:251], [%[c36], %[c37], %[c38], %[c39]] \n"
" buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[24:27], 0 offen offset:3072 \n" " buffer_load_dwordx4 acc[76:79], %[v_os_b4], %[s_res_b], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[204:205], v[252:253], [%[c36], %[c37], %[c38], %[c39]] \n" _UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[204:205], v[252:253], [%[c36], %[c37], %[c38], %[c39]] \n"
_UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[206:207], v[254:255], [%[c36], %[c37], %[c38], %[c39]] \n" _UK_MFMA_ " [%[c36], %[c37], %[c38], %[c39]], acc[206:207], v[254:255], [%[c36], %[c37], %[c38], %[c39]] \n"
_UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[208:209], v[224:225], [%[c40], %[c41], %[c42], %[c43]] \n" _UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[208:209], v[224:225], [%[c40], %[c41], %[c42], %[c43]] \n"
_UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[210:211], v[226:227], [%[c40], %[c41], %[c42], %[c43]] \n" _UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[210:211], v[226:227], [%[c40], %[c41], %[c42], %[c43]] \n"
" buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[24:27], 0 offen \n" " buffer_load_dwordx4 acc[80:83], %[v_os_b5], %[s_res_b], 0 offen \n"
_UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[212:213], v[228:229], [%[c40], %[c41], %[c42], %[c43]] \n" _UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[212:213], v[228:229], [%[c40], %[c41], %[c42], %[c43]] \n"
_UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[214:215], v[230:231], [%[c40], %[c41], %[c42], %[c43]] \n" _UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[214:215], v[230:231], [%[c40], %[c41], %[c42], %[c43]] \n"
_UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[216:217], v[232:233], [%[c40], %[c41], %[c42], %[c43]] \n" _UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[216:217], v[232:233], [%[c40], %[c41], %[c42], %[c43]] \n"
_UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[218:219], v[234:235], [%[c40], %[c41], %[c42], %[c43]] \n" _UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[218:219], v[234:235], [%[c40], %[c41], %[c42], %[c43]] \n"
" buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[24:27], 0 offen offset:1024 \n" " buffer_load_dwordx4 acc[84:87], %[v_os_b5], %[s_res_b], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[220:221], v[236:237], [%[c40], %[c41], %[c42], %[c43]] \n" _UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[220:221], v[236:237], [%[c40], %[c41], %[c42], %[c43]] \n"
_UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[222:223], v[238:239], [%[c40], %[c41], %[c42], %[c43]] \n" _UK_MFMA_ " [%[c40], %[c41], %[c42], %[c43]], acc[222:223], v[238:239], [%[c40], %[c41], %[c42], %[c43]] \n"
_UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[208:209], v[240:241], [%[c44], %[c45], %[c46], %[c47]] \n" _UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[208:209], v[240:241], [%[c44], %[c45], %[c46], %[c47]] \n"
_UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[210:211], v[242:243], [%[c44], %[c45], %[c46], %[c47]] \n" _UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[210:211], v[242:243], [%[c44], %[c45], %[c46], %[c47]] \n"
" buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[24:27], 0 offen offset:2048 \n" " buffer_load_dwordx4 acc[88:91], %[v_os_b5], %[s_res_b], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[212:213], v[244:245], [%[c44], %[c45], %[c46], %[c47]] \n" _UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[212:213], v[244:245], [%[c44], %[c45], %[c46], %[c47]] \n"
_UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[214:215], v[246:247], [%[c44], %[c45], %[c46], %[c47]] \n" _UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[214:215], v[246:247], [%[c44], %[c45], %[c46], %[c47]] \n"
_UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[216:217], v[248:249], [%[c44], %[c45], %[c46], %[c47]] \n" _UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[216:217], v[248:249], [%[c44], %[c45], %[c46], %[c47]] \n"
_UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[218:219], v[250:251], [%[c44], %[c45], %[c46], %[c47]] \n" _UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[218:219], v[250:251], [%[c44], %[c45], %[c46], %[c47]] \n"
" buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[24:27], 0 offen offset:3072 \n" " buffer_load_dwordx4 acc[92:95], %[v_os_b5], %[s_res_b], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[220:221], v[252:253], [%[c44], %[c45], %[c46], %[c47]] \n" _UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[220:221], v[252:253], [%[c44], %[c45], %[c46], %[c47]] \n"
_UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[222:223], v[254:255], [%[c44], %[c45], %[c46], %[c47]] \n" _UK_MFMA_ " [%[c44], %[c45], %[c46], %[c47]], acc[222:223], v[254:255], [%[c44], %[c45], %[c46], %[c47]] \n"
" s_waitcnt vmcnt(32) \n" " s_waitcnt vmcnt(32) \n"
_UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[224:225], v[224:225], [%[c48], %[c49], %[c50], %[c51]] \n" _UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[224:225], v[224:225], [%[c48], %[c49], %[c50], %[c51]] \n"
_UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[226:227], v[226:227], [%[c48], %[c49], %[c50], %[c51]] \n" _UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[226:227], v[226:227], [%[c48], %[c49], %[c50], %[c51]] \n"
" buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[24:27], 0 offen \n" " buffer_load_dwordx4 acc[96:99], %[v_os_b6], %[s_res_b], 0 offen \n"
_UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[228:229], v[228:229], [%[c48], %[c49], %[c50], %[c51]] \n" _UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[228:229], v[228:229], [%[c48], %[c49], %[c50], %[c51]] \n"
_UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[230:231], v[230:231], [%[c48], %[c49], %[c50], %[c51]] \n" _UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[230:231], v[230:231], [%[c48], %[c49], %[c50], %[c51]] \n"
_UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[232:233], v[232:233], [%[c48], %[c49], %[c50], %[c51]] \n" _UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[232:233], v[232:233], [%[c48], %[c49], %[c50], %[c51]] \n"
_UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[234:235], v[234:235], [%[c48], %[c49], %[c50], %[c51]] \n" _UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[234:235], v[234:235], [%[c48], %[c49], %[c50], %[c51]] \n"
" buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[24:27], 0 offen offset:1024 \n" " buffer_load_dwordx4 acc[100:103], %[v_os_b6], %[s_res_b], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[236:237], v[236:237], [%[c48], %[c49], %[c50], %[c51]] \n" _UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[236:237], v[236:237], [%[c48], %[c49], %[c50], %[c51]] \n"
_UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[238:239], v[238:239], [%[c48], %[c49], %[c50], %[c51]] \n" _UK_MFMA_ " [%[c48], %[c49], %[c50], %[c51]], acc[238:239], v[238:239], [%[c48], %[c49], %[c50], %[c51]] \n"
_UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[224:225], v[240:241], [%[c52], %[c53], %[c54], %[c55]] \n" _UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[224:225], v[240:241], [%[c52], %[c53], %[c54], %[c55]] \n"
_UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[226:227], v[242:243], [%[c52], %[c53], %[c54], %[c55]] \n" _UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[226:227], v[242:243], [%[c52], %[c53], %[c54], %[c55]] \n"
" buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[24:27], 0 offen offset:2048 \n" " buffer_load_dwordx4 acc[104:107], %[v_os_b6], %[s_res_b], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[228:229], v[244:245], [%[c52], %[c53], %[c54], %[c55]] \n" _UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[228:229], v[244:245], [%[c52], %[c53], %[c54], %[c55]] \n"
_UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[230:231], v[246:247], [%[c52], %[c53], %[c54], %[c55]] \n" _UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[230:231], v[246:247], [%[c52], %[c53], %[c54], %[c55]] \n"
_UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[232:233], v[248:249], [%[c52], %[c53], %[c54], %[c55]] \n" _UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[232:233], v[248:249], [%[c52], %[c53], %[c54], %[c55]] \n"
_UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[234:235], v[250:251], [%[c52], %[c53], %[c54], %[c55]] \n" _UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[234:235], v[250:251], [%[c52], %[c53], %[c54], %[c55]] \n"
" buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[24:27], 0 offen offset:3072 \n" " buffer_load_dwordx4 acc[108:111], %[v_os_b6], %[s_res_b], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[236:237], v[252:253], [%[c52], %[c53], %[c54], %[c55]] \n" _UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[236:237], v[252:253], [%[c52], %[c53], %[c54], %[c55]] \n"
_UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[238:239], v[254:255], [%[c52], %[c53], %[c54], %[c55]] \n" _UK_MFMA_ " [%[c52], %[c53], %[c54], %[c55]], acc[238:239], v[254:255], [%[c52], %[c53], %[c54], %[c55]] \n"
_UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[240:241], v[224:225], [%[c56], %[c57], %[c58], %[c59]] \n" _UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[240:241], v[224:225], [%[c56], %[c57], %[c58], %[c59]] \n"
_UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[242:243], v[226:227], [%[c56], %[c57], %[c58], %[c59]] \n" _UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[242:243], v[226:227], [%[c56], %[c57], %[c58], %[c59]] \n"
" buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[24:27], 0 offen \n" " buffer_load_dwordx4 acc[112:115], %[v_os_b7], %[s_res_b], 0 offen \n"
_UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[244:245], v[228:229], [%[c56], %[c57], %[c58], %[c59]] \n" _UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[244:245], v[228:229], [%[c56], %[c57], %[c58], %[c59]] \n"
_UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[246:247], v[230:231], [%[c56], %[c57], %[c58], %[c59]] \n" _UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[246:247], v[230:231], [%[c56], %[c57], %[c58], %[c59]] \n"
_UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[248:249], v[232:233], [%[c56], %[c57], %[c58], %[c59]] \n" _UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[248:249], v[232:233], [%[c56], %[c57], %[c58], %[c59]] \n"
_UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[250:251], v[234:235], [%[c56], %[c57], %[c58], %[c59]] \n" _UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[250:251], v[234:235], [%[c56], %[c57], %[c58], %[c59]] \n"
" buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[24:27], 0 offen offset:1024 \n" " buffer_load_dwordx4 acc[116:119], %[v_os_b7], %[s_res_b], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[252:253], v[236:237], [%[c56], %[c57], %[c58], %[c59]] \n" _UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[252:253], v[236:237], [%[c56], %[c57], %[c58], %[c59]] \n"
_UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[254:255], v[238:239], [%[c56], %[c57], %[c58], %[c59]] \n" _UK_MFMA_ " [%[c56], %[c57], %[c58], %[c59]], acc[254:255], v[238:239], [%[c56], %[c57], %[c58], %[c59]] \n"
_UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[240:241], v[240:241], [%[c60], %[c61], %[c62], %[c63]] \n" _UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[240:241], v[240:241], [%[c60], %[c61], %[c62], %[c63]] \n"
_UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[242:243], v[242:243], [%[c60], %[c61], %[c62], %[c63]] \n" _UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[242:243], v[242:243], [%[c60], %[c61], %[c62], %[c63]] \n"
" buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[24:27], 0 offen offset:2048 \n" " buffer_load_dwordx4 acc[120:123], %[v_os_b7], %[s_res_b], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[244:245], v[244:245], [%[c60], %[c61], %[c62], %[c63]] \n" _UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[244:245], v[244:245], [%[c60], %[c61], %[c62], %[c63]] \n"
_UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[246:247], v[246:247], [%[c60], %[c61], %[c62], %[c63]] \n" _UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[246:247], v[246:247], [%[c60], %[c61], %[c62], %[c63]] \n"
_UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[248:249], v[248:249], [%[c60], %[c61], %[c62], %[c63]] \n" _UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[248:249], v[248:249], [%[c60], %[c61], %[c62], %[c63]] \n"
_UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[250:251], v[250:251], [%[c60], %[c61], %[c62], %[c63]] \n" _UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[250:251], v[250:251], [%[c60], %[c61], %[c62], %[c63]] \n"
" buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[24:27], 0 offen offset:3072 \n" " buffer_load_dwordx4 acc[124:127], %[v_os_b7], %[s_res_b], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[252:253], v[252:253], [%[c60], %[c61], %[c62], %[c63]] \n" _UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[252:253], v[252:253], [%[c60], %[c61], %[c62], %[c63]] \n"
_UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[254:255], v[254:255], [%[c60], %[c61], %[c62], %[c63]] \n" _UK_MFMA_ " [%[c60], %[c61], %[c62], %[c63]], acc[254:255], v[254:255], [%[c60], %[c61], %[c62], %[c63]] \n"
" s_add_u32 s60, 0x00000300, s80 \n" " s_add_u32 s60, 0x00000300, s80 \n"
...@@ -591,7 +567,7 @@ ...@@ -591,7 +567,7 @@
" s_addk_i32 s80, 0x0100 \n" " s_addk_i32 s80, 0x0100 \n"
" s_cmp_lt_i32 s80, %[s_loop_cnt] \n" " s_cmp_lt_i32 s80, %[s_loop_cnt] \n"
" s_cbranch_scc0 label_end \n" " s_cbranch_scc0 label_end \n"
" s_branch label_start%= \n" " s_branch label_start \n"
" label_end : \n" " label_end : \n"
";---------------------------------------------- \n" ";---------------------------------------------- \n"
" v_cvt_f32_i32 v128, v128 \n" " v_cvt_f32_i32 v128, v128 \n"
...@@ -789,5 +765,3 @@ ...@@ -789,5 +765,3 @@
#undef _UK_MFMA_ #undef _UK_MFMA_
#undef _DEQUAN_CVT_ #undef _DEQUAN_CVT_
...@@ -102,7 +102,7 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8 ...@@ -102,7 +102,7 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8
return MRepeat; return MRepeat;
} }
// TODO: properly support scatter/gather // TODO: properly support scatter/gather for load only
CK_TILE_DEVICE auto GetRowCoords_A(index_t base_offset) CK_TILE_DEVICE auto GetRowCoords_A(index_t base_offset)
{ {
constexpr index_t KLans = BlockShape::Block_K0 / kAlignmentA; constexpr index_t KLans = BlockShape::Block_K0 / kAlignmentA;
...@@ -116,6 +116,7 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8 ...@@ -116,6 +116,7 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8
return coords; return coords;
} }
//for mma and A scale
CK_TILE_DEVICE auto GetRowCoords_A_mma(index_t base_offset) CK_TILE_DEVICE auto GetRowCoords_A_mma(index_t base_offset)
{ {
// constexpr index_t KLans = 2; // constexpr index_t KLans = 2;
...@@ -156,6 +157,22 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8 ...@@ -156,6 +157,22 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8
return w; return w;
} }
// Gathers one activation (A) quantization scale per entry of row_ids_mma.
//
// Each entry of row_ids_mma is decoded as a packed id: the low 24 bits select the
// token and the bits above bit 24 select the top-k slot. The scale for that entry
// is read from a_scale_ptr[token * topk + slot].
//
// row_ids_mma : indexable, statically-sized collection of packed ids (by value).
// a_scale_ptr : base pointer of the A-scale buffer.
//               NOTE(review): presumably laid out as [num_tokens, topk] - confirm.
// topk        : number of top-k slots per token.
//               NOTE(review): the previous body indexed with kargs.topk, which is not
//               in scope in this helper. The default of 1 only keeps two-argument
//               call sites compiling; callers should pass kargs.topk explicitly.
//
// Returns an array with one scale per input id, in the same order.
// NOTE(review): element type is now AScaleDataType (matching the pointer) instead of
// TopkWeightDataType - confirm the consumer expects this type.
template <typename ROW_IDS>
CK_TILE_DEVICE auto GetAScale(const ROW_IDS row_ids_mma,
                              const AScaleDataType* a_scale_ptr,
                              index_t topk = 1)
{
    constexpr index_t n_size = row_ids_mma.size();

    array<AScaleDataType, n_size> scales;
    static_for<0, n_size, 1>{}([&](auto i) {
        // Split the packed id into token index (low 24 bits) and top-k slot (high bits).
        const auto token_id  = row_ids_mma[i] & 0xffffff;
        const auto topk_slot = row_ids_mma[i] >> 24;
        scales.at(i)         = a_scale_ptr[token_id * topk + topk_slot];
    });
    return scales;
}
// TODO: this row id is before shuffle atomic, need use acc distribution // TODO: this row id is before shuffle atomic, need use acc distribution
CK_TILE_DEVICE auto GetRowCoords_O(index_t base_offset) CK_TILE_DEVICE auto GetRowCoords_O(index_t base_offset)
{ {
...@@ -203,7 +220,7 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8 ...@@ -203,7 +220,7 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8
BlockShape::Block_Kr1); // intermediate_tile_id * Block_N / (N in W) BlockShape::Block_Kr1); // intermediate_tile_id * Block_N / (N in W)
auto row_coords_a = GetRowCoords_A(sorted_tile_id * BlockShape::Block_M0); auto row_coords_a = GetRowCoords_A(sorted_tile_id * BlockShape::Block_M0);
auto row_coords_a_mma = GetRowCoords_A_mma(sorted_tile_id * BlockShape::Block_M0); auto row_coords_a_mma = GetRowCoords_O(sorted_tile_id * BlockShape::Block_M0);
auto row_ids_a = GetRowID( auto row_ids_a = GetRowID(
row_coords_a, reinterpret_cast<const IndexDataType*>(kargs.sorted_token_ids_ptr)); row_coords_a, reinterpret_cast<const IndexDataType*>(kargs.sorted_token_ids_ptr));
auto row_ids_a_mma = GetRowID( auto row_ids_a_mma = GetRowID(
...@@ -221,7 +238,7 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8 ...@@ -221,7 +238,7 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8
//addr in fact //addr in fact
auto a_coords = generate_tuple( auto a_coords = generate_tuple(
[&](auto i) { [&](auto i) {
return (row_ids_a[i]) * kargs.stride_token + return ((row_ids_a[i])&0xffffff) * kargs.stride_token +
threadIdx.x % (BlockShape::Block_K0 / kAlignmentA) * kAlignmentA; threadIdx.x % (BlockShape::Block_K0 / kAlignmentA) * kAlignmentA;
}, },
number<row_ids_a.size()>{}); number<row_ids_a.size()>{});
...@@ -231,9 +248,11 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8 ...@@ -231,9 +248,11 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8
//////aq //////aq
auto aq_win = [&]() { auto aq_win = [&]() {
const AScaleDataType* aq_ptr = reinterpret_cast<const AScaleDataType*>(kargs.a_scale_ptr); const AScaleDataType* aq_ptr = reinterpret_cast<const AScaleDataType*>(kargs.a_scale_ptr);
auto aq_view_ = make_naive_tensor_view_packed<address_space_enum::global>( auto aq_view_ = make_naive_tensor_view<address_space_enum::global>(
aq_ptr, aq_ptr,
make_tuple(kargs.num_tokens * kargs.topk), make_tuple(kargs.num_tokens * kargs.topk),
make_tuple(1),
number<1>{},
number<1>{}); number<1>{});
return aq_view_; return aq_view_;
...@@ -272,9 +291,11 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8 ...@@ -272,9 +291,11 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8
static_cast<long_index_t>(expert_id) * g_scale_expert_stride_0 + static_cast<long_index_t>(expert_id) * g_scale_expert_stride_0 +
intermediate_tile_id * BlockShape::Block_N0; intermediate_tile_id * BlockShape::Block_N0;
// const GDataType* g_ptr = reinterpret_cast<const GScaleDataType*>(kargs.g_scale_ptr);//remember to add expert id for inline // const GDataType* g_ptr = reinterpret_cast<const GScaleDataType*>(kargs.g_scale_ptr);//remember to add expert id for inline
auto gq_view_ = make_naive_tensor_view_packed<address_space_enum::global>( auto gq_view_ = make_naive_tensor_view<address_space_enum::global>(
gq_ptr, gq_ptr,
make_tuple(shared_intermediate_size_1), make_tuple(shared_intermediate_size_1),
make_tuple(1),
number<1>{},
number<1>{}); number<1>{});
return gq_view_; return gq_view_;
...@@ -287,9 +308,11 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8 ...@@ -287,9 +308,11 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8
static_cast<long_index_t>(expert_id) * smq_scale_expert_stride_0 + static_cast<long_index_t>(expert_id) * smq_scale_expert_stride_0 +
intermediate_tile_id * BlockShape::Block_N0; intermediate_tile_id * BlockShape::Block_N0;
// const GDataType* g_ptr = reinterpret_cast<const GScaleDataType*>(kargs.g_scale_ptr);//remember to add expert id for inline // const GDataType* g_ptr = reinterpret_cast<const GScaleDataType*>(kargs.g_scale_ptr);//remember to add expert id for inline
auto smq_view_ = make_naive_tensor_view_packed<address_space_enum::global>( auto smq_view_ = make_naive_tensor_view<address_space_enum::global>(
smq_ptr, smq_ptr,
make_tuple(shared_intermediate_size_1), make_tuple(shared_intermediate_size_1),
make_tuple(1),
number<1>{},
number<1>{}); number<1>{});
return smq_view_; return smq_view_;
...@@ -393,12 +416,14 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8 ...@@ -393,12 +416,14 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8
auto row_coords_o = GetRowCoords_O(sorted_tile_id * BlockShape::Block_M0); auto row_coords_o = GetRowCoords_O(sorted_tile_id * BlockShape::Block_M0);
auto w_scale = GetWeightScale( auto w_scale = GetWeightScale(
row_coords_o, reinterpret_cast<const TopkWeightDataType*>(kargs.sorted_weight_ptr)); row_coords_o, reinterpret_cast<const TopkWeightDataType*>(kargs.sorted_weight_ptr));
auto a_scale = GetAScale(
row_coords_o, reinterpret_cast<const TopkWeightDataType*>(kargs.a_scale_ptr));
auto uk_0 = Policy::template GetUK_0<Problem>(); auto uk_0 = Policy::template GetUK_0<Problem>();
// auto acc_0= uk_0( // auto acc_0= uk_0(
uk_0( uk_0(
row_ids_a_mma,//fake token id, 2D index for X scale row_ids_a_mma,//fake token id, 2D index for X scale
aq_res, a_scale,
dq_res, dq_res,
gq_res, gq_res,
smq_res, smq_res,
...@@ -430,7 +455,8 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8 ...@@ -430,7 +455,8 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8
// block_sync_lds(); // block_sync_lds();
auto uk_1 = Policy::template GetUK_1<Problem>(); auto uk_1 = Policy::template GetUK_1<Problem>();
uk_1(d_res, uk_1(dq_res,
d_res,
d_coords, d_coords,
o_res, o_res,
o_coords, o_coords,
......
...@@ -17,7 +17,7 @@ fi ...@@ -17,7 +17,7 @@ fi
cmake \ cmake \
-D CMAKE_PREFIX_PATH=/opt/rocm \ -D CMAKE_PREFIX_PATH=/opt/rocm \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_CXX_FLAGS="-Xclang -mllvm -Xclang -enable-post-misched=0 -std=c++17 -O3 -ftemplate-backtrace-limit=0 -fPIE -Wno-gnu-line-marker" \ -D CMAKE_CXX_FLAGS="-Xclang -mllvm -Xclang -enable-post-misched=0 -std=c++17 -O3 -ftemplate-backtrace-limit=0 -fPIE -Wno-gnu-line-marker -save-temps=$PWD" \
-D CMAKE_BUILD_TYPE=Release \ -D CMAKE_BUILD_TYPE=Release \
-D BUILD_DEV=ON \ -D BUILD_DEV=ON \
-D GPU_TARGETS=$GPU_TARGETS \ -D GPU_TARGETS=$GPU_TARGETS \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment