Commit a759277d authored by shengnxu

fix some errors

parent f549173b
@@ -97,14 +97,14 @@ auto create_args(int argc, char* argv[])
         .insert("tp", "8", "tensor parallel size")
         .insert("v", "1", "cpu validation or not")
         .insert("kname", "1", "print kernel name or not")
-        .insert("prec_i", "bf16", "input precision")
-        .insert("prec_w", "bf16", "weight precision")
+        .insert("prec_i", "int8", "input precision")
+        .insert("prec_w", "int8", "weight precision")
         .insert("prec_o", "bf16", "output precision")
         .insert("prec_st", "auto", "token scale data type. auto will set to fp32")
         .insert("prec_sw", "auto", "weight scale data type. auto will set to fp32")
         .insert("prec_sq", "auto", "(dynamic) smooth quant data type. auto will set to fp32")
         .insert("prec_kw", "auto", "topk-weight data type. auto will set to fp32")
-        .insert("fquant", "0", "fused-quant, 0:no, 1:smooth-dynamic-quant, 2:dynamic-quant")
+        .insert("fquant", "1", "fused-quant, 0:no, 1:smooth-dynamic-quant, 2:dynamic-quant")
         .insert(
             "gate_only", "1", "w0(gate/up) style, 0:gate+up will double interm size, 1:only gate")
         .insert("api", "0", "benchmark api set: 0:fused-moe(moe-gemm+moe-sorting), 1:moe-gemm")
@@ -218,10 +218,15 @@ bool run(const ck_tile::ArgParser& arg_parser)
     ck_tile::HostTensor<GDataType> g_host({experts, shared_intermediate_size_0, hidden_size});
     ck_tile::HostTensor<DDataType> d_host({experts, hidden_size, shared_intermediate_size_1});
     ck_tile::HostTensor<ODataType> o_host({tokens, hidden_size}, {stride, 1});
-    ck_tile::HostTensor<AScaleDataType> sa_host({tokens});
-    ck_tile::HostTensor<GScaleDataType> sg_host({shared_intermediate_size_0});
-    ck_tile::HostTensor<DScaleDataType> sd_host({shared_intermediate_size_1});
-    ck_tile::HostTensor<YSmoothScaleDataType> sy_host({shared_intermediate_size_1}); // smooth-quant
+    if(fused_quant == 1)
+    {
+        ck_tile::HostTensor<AScaleDataType> sa_host({tokens, topk});
+    } else {
+        ck_tile::HostTensor<AScaleDataType> sa_host({tokens});
+    }
+    ck_tile::HostTensor<GScaleDataType> sg_host({experts, shared_intermediate_size_0});
+    ck_tile::HostTensor<DScaleDataType> sd_host({experts, shared_intermediate_size_1});
+    ck_tile::HostTensor<YSmoothScaleDataType> sy_host({experts, shared_intermediate_size_1}); // smooth-quant
     ck_tile::HostTensor<IndexDataType> topk_ids_host({tokens, topk});         // to be sort
     ck_tile::HostTensor<TopkWeightDataType> topk_weight_host({tokens, topk}); // to be sort
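A hedged host-side note on the hunk above: with fused_quant == 1 the activation-scale tensor becomes per-(token, topk), otherwise it stays per-token. As written in the diff the two sa_host declarations live inside the if/else scopes; one way to keep a single sa_host visible to the rest of run() (assuming ck_tile::HostTensor can be constructed from a vector of lengths, which this diff does not show) would be:

    // Sketch only, same scope as the tensors above; the vector-of-lengths
    // constructor is an assumption about ck_tile::HostTensor.
    std::vector<std::size_t> sa_lens =
        (fused_quant == 1)
            ? std::vector<std::size_t>{std::size_t(tokens), std::size_t(topk)}
            : std::vector<std::size_t>{std::size_t(tokens)};
    ck_tile::HostTensor<AScaleDataType> sa_host(sa_lens);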
@@ -440,7 +445,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
                            hidden_size,
                            shared_intermediate_size_0,
                            topk,
-                           gate_only);
+                           gate_only,
+                           fused_quant);
     auto o_dev = o_buf.ToHost<ODataType>();
     // o_dev.savetxt("gpu-out.txt", "float");
......
@@ -75,7 +75,8 @@ void reference_fused_moe(
     ck_tile::index_t hidden_size,
     ck_tile::index_t intermediate_size, // this size is for gate/up
     ck_tile::index_t topk,
-    ck_tile::index_t gate_only)
+    ck_tile::index_t gate_only,
+    ck_tile::index_t fquant)
 {
     assert(sorted_token_ids_host.get_num_of_dimension() == 1);
     assert(sorted_weight_host.get_num_of_dimension() == 1);
@@ -106,22 +107,40 @@ void reference_fused_moe(
             return;
         ck_tile::index_t i_expert = sorted_expert_ids_host.mData[i_tile];
         ck_tile::index_t i_token  = sorted_token_ids_host.mData[i_flatten];
+        ck_tile::index_t i_weight_idx = 0;
+        if(fquant == 1)
+        {
+            // sorted token id packs the topk slot in bits [31:24]
+            i_weight_idx = i_token >> 24;
+            i_token      = i_token & 0xffffff;
+        }
         if(i_token >= tokens)
             return;
         ck_tile::index_t i_topk = get_topk_id(i_token, i_expert); // TODO: ugly
-        auto weight = sorted_weight_host.mData[i_flatten];
-        ck_tile::HostTensor<AccDataType> acc_0({1, intermediate_size_0});
+        auto weight = sorted_weight_host.mData[i_flatten]; // topk routing weight
+        ck_tile::HostTensor<float> acc_0({1, intermediate_size_0});
         // first gemm
         for(ck_tile::index_t i_n = 0; i_n < intermediate_size_0; i_n++)
         {
             AccDataType acc = static_cast<AccDataType>(0);
             for(ck_tile::index_t i_k = 0; i_k < hidden_size; i_k++)
             {
-                acc += type_convert<AccDataType>(a_host(i_token, i_k)) *
-                       type_convert<AccDataType>(g_host(i_expert, i_n, i_k));
+                acc += type_convert<float>(a_host(i_token, i_k)) *
+                       type_convert<float>(g_host(i_expert, i_n, i_k));
             }
-            acc_0(0, i_n) = acc;
+            if(fquant == 1)
+            {
+                // smooth quant: per-(token, topk) activation scale * per-channel gate scale
+                acc_0(0, i_n) = acc * sa_host(i_token, i_weight_idx) * sg_host(i_expert, i_n);
+            }
+            else if(fquant == 2)
+            {
+                // dynamic quant: per-token activation scale * per-channel gate scale
+                acc_0(0, i_n) = acc * sa_host(i_token) * sg_host(i_expert, i_n);
+            }
+            else
+            {
+                // no quant
+                acc_0(0, i_n) = acc;
+            }
             // printf("ie:%2d, it:%3d, in:%d, %f\n", i_expert, i_token, i_n, acc);
         }
@@ -158,10 +177,14 @@ void reference_fused_moe(
         {
             AccDataType acc = static_cast<AccDataType>(0);
             for(ck_tile::index_t i_k = 0; i_k < intermediate_size_1; i_k++)
             {
-                acc += y(0, i_k) * type_convert<AccDataType>(d_host(i_expert, i_n, i_k));
+                if(fquant == 1)
+                {
+                    // apply the per-channel smooth-quant scale on y
+                    acc += y(0, i_k) * sy_host(i_expert, i_k) * type_convert<float>(d_host(i_expert, i_n, i_k));
+                }
+                else
+                {
+                    acc += y(0, i_k) * type_convert<float>(d_host(i_expert, i_n, i_k));
+                }
             }
-            acc_1(0, i_n) = acc * weight; // multiple weight here
+            acc_1(0, i_n) = acc * type_convert<float>(weight); // multiply topk weight here
         }
         for(ck_tile::index_t i_n = 0; i_n < hidden_size; i_n++)
@@ -177,7 +200,7 @@ void reference_fused_moe(
     auto r = [&](auto i_token) {
         for(ck_tile::index_t i_n = 0; i_n < hidden_size; i_n++)
         {
-            AccDataType acc = type_convert<AccDataType>(0);
+            AccDataType acc = type_convert<float>(0);
             for(ck_tile::index_t i_topk = 0; i_topk < topk; i_topk++)
             {
                 acc += out_topk_tokens(i_token, i_topk, i_n);
......
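The reference path above unpacks the topk slot from the sorted token id when fquant == 1 and then uses it to pick the per-(token, topk) activation scale sa_host(i_token, i_weight_idx). A minimal stand-alone sketch of that convention follows; the helper names are mine, and the packing direction is inferred from the unpacking shown in the diff:

    #include <cstdint>

    // Sorted token id layout assumed by the fquant == 1 path:
    //   bits [31:24] -> topk slot (i_weight_idx), bits [23:0] -> token index (i_token).
    inline std::int32_t pack_sorted_id(std::int32_t token, std::int32_t topk_slot)
    {
        return (topk_slot << 24) | (token & 0xffffff);
    }

    inline void unpack_sorted_id(std::int32_t sorted_id, std::int32_t& token, std::int32_t& topk_slot)
    {
        topk_slot = sorted_id >> 24;      // i_weight_idx in the reference code
        token     = sorted_id & 0xffffff; // i_token, later bounds-checked against `tokens`
    }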
@@ -4,7 +4,9 @@
 #pragma once
 #include "ck_tile/ops/flatmm/block/flatmm_32x512x128_1x4x1_16x16x32.hpp"
+#include "ck_tile/ops/flatmm/block/flatmm_32x512x256_1x4x1_16x16x64_int8.hpp"
 #include "ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32.hpp"
+#include "ck_tile/ops/flatmm/block/flatmm_sn_32x256x512_1x4x1_16x16x64_int8.hpp"
 #include "ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32_itl.hpp"
 #include "ck_tile/ops/flatmm/block/flatmm_uk_config.hpp"
 #include "ck_tile/ops/common/generic_2d_block_shape.hpp"
......
@@ -38,7 +38,7 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_Base // for int8/fp8
 {
     static constexpr index_t Block_M = 32;
     static constexpr index_t Block_N = 512;
-    static constexpr index_t Block_K = 258;
+    static constexpr index_t Block_K = 256;
     static constexpr index_t WarpPerBlock_M = 1;
     static constexpr index_t WarpPerBlock_N = 4;
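The K fix above (258 -> 256) is consistent with the tile name: with a 16x16x64 int8 MFMA macro-tile and a 1x4x1 wave layout, Block_K has to be a multiple of the per-wave K of 64. A small illustrative sketch of the implied decomposition; the WarpTile_*/Repeat_* names are assumptions based on the usual ck_tile naming, not members shown in this diff:

    // Illustrative only; mirrors the constants of Flatmm_32x512x256_1x4x1_16x16x64_Base.
    constexpr int Block_M = 32, Block_N = 512, Block_K = 256;
    constexpr int WarpPerBlock_M = 1, WarpPerBlock_N = 4, WarpPerBlock_K = 1;
    constexpr int WarpTile_M = 16, WarpTile_N = 16, WarpTile_K = 64; // 16x16x64 int8 MFMA

    static_assert(Block_K % (WarpPerBlock_K * WarpTile_K) == 0,
                  "Block_K must be a multiple of 64; the old value 258 fails this");

    constexpr int Repeat_M = Block_M / (WarpPerBlock_M * WarpTile_M); // 2
    constexpr int Repeat_N = Block_N / (WarpPerBlock_N * WarpTile_N); // 8
    constexpr int Repeat_K = Block_K / (WarpPerBlock_K * WarpTile_K); // 4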
@@ -245,14 +245,14 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16
     // TODO: need paired with tile_window_linear!
     // TODO: need call init_raw() before call this function!
-    template <typename DQRes, typename GQRes, typename SMQRes, typename ARes, typename ACoords, typename BRes, typename BCoords>
+    template <typename AQRes, typename DQRes, typename GQRes, typename SMQRes, typename ARes, typename ACoords, typename BRes, typename BCoords>
     CK_TILE_DEVICE auto
     operator()( index_t row_ids_a_,
-                const DQes& res_aq
-                const DQes& res_dq,
+                const AQRes& res_aq,
+                const DQRes& res_dq,
                 const GQRes& res_gq,
                 const SMQRes& res_smq,
-                const Res& res_a,
+                const ARes& res_a,
                 const ACoords& cached_coords_a,
                 const BRes& res_b,
                 const BCoords& cached_coords_b,
@@ -302,7 +302,8 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16
             },
             number<a_sld.get_num_of_access()>{});

-        index_t loop_cnt = k / Block_K;
+        // index_t loop_cnt = k / Block_K;
+        index_t loop_cnt = k;

        // this is the acc thread buffer
        register int v_z0 asm("v128") = 0;
@@ -398,7 +399,11 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16
            // [v_acc_15]"+v"(v_acc[15]),
            [v_token_id]"+v"(row_ids_a_),
            [s_mem_]"+r"(smem)
-           : [s_res_dq0]"s"(res_dq[0]),
+           : [s_res_aq0]"s"(res_aq[0]),
+             [s_res_aq1]"s"(res_aq[1]),
+             [s_res_aq2]"s"(res_aq[2]),
+             [s_res_aq3]"s"(res_aq[3]),
+             [s_res_dq0]"s"(res_dq[0]),
             [s_res_dq1]"s"(res_dq[1]),
             [s_res_dq2]"s"(res_dq[2]),
             [s_res_dq3]"s"(res_dq[3]),
@@ -436,10 +441,11 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16
             [v_os_b6]"v"(static_cast<index_t>(cached_coords_b[number<6>{}] * sizeof(BDataType))),
             [v_os_b7]"v"(static_cast<index_t>(cached_coords_b[number<7>{}] * sizeof(BDataType))),
-            [v_os_slda]"v"(static_cast<index_t>(a_sld.cached_coords_[number<0>{}].get_offset() * sizeof(ADataType))),
+            [v_os_sld]"v"(static_cast<index_t>(a_sld.cached_coords_[number<0>{}].get_offset() * sizeof(ADataType))),
             [s_m0_init]"s"(m0_init_value),
             [s_size_per_issue]"s"(size_per_issue),
             [smem_sz]"n"(smem_buf_size), //(smem_buf_size),
+            [s_wave_id]"s"(get_warp_id()),
             [sld_os_0]"n"(sld_os[number<0>{}].value),
             [sld_os_1]"n"(sld_os[number<1>{}].value),
             [sld_os_2]"n"(sld_os[number<2>{}].value),
@@ -450,7 +456,8 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16
             [sld_os_7]"n"(sld_os[number<7>{}].value),
             [s_tile_os_a]"s"(tile_offset_a_bytes),
             [s_tile_os_b]"s"(tile_offset_b_bytes)
-           : "memory", "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8", "a9",
+           :
+             "memory", "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8", "a9",
             "a10", "a11", "a12", "a13", "a14", "a15", "a16", "a17", "a18", "a19",
             "a20", "a21", "a22", "a23", "a24", "a25", "a26", "a27", "a28", "a29",
             "a30", "a31", "a32", "a33", "a34", "a35", "a36", "a37", "a38", "a39",
...@@ -480,84 +487,113 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16 ...@@ -480,84 +487,113 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16
"a236", "a237", "a238", "a239", "a240", "a241", "a242", "a243", "a236", "a237", "a238", "a239", "a240", "a241", "a242", "a243",
"a244", "a245", "a246", "a247", "a248", "a249", "a250", "a251", "a244", "a245", "a246", "a247", "a248", "a249", "a250", "a251",
"a252", "a253", "a254", "a255", "a252", "a253", "a254", "a255",
"s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23", "s6", "s7", "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15",
"s86", // s86 as tmp "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23", "s24", "s25",
"v64", "v65", "v66", "v67", "v68", "v69", "s26", "s27", "s28", "s29", "s30", "s31", "s32", "s33", "s34", "s35",
"v70", "v71", "v72", "v73", "v74", "v75", "v76", "v77", "v78", "v79", "s36", "s37", "s38", "s39", "s40", "s41", "s42", "s43", "s44", "s45",
"v80", "v81", "v82", "v83", "v84", "v85", "v86", "v87", "v88", "v89", "s46", "s47", "s48", "s49", "s50", "s51", "s52", "s53", "s54",
"v90", "v91", "v92", "v93", "v94", "v95", "v96", "v97", "v98", "v99", "s55", "s56", "s57", "s58", "s59", "s60", "s61", "s62", "s63",
"v100", "v101", "v102", "v103", "v104", "v105", "v106", "v107", "s64", "s65", "s66", "s67", "s68", "s69", "s70", "s71", "s72",
"v108", "v109", "v110", "v111", "v112", "v113", "v114", "v115", "s73", "s74", "s75", "s76", "s77", "s78", "s79", "s80", // s86 as tmp
"v116", "v117", "v118", "v119", "v120", "v121", "v122", "v123", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
"v124", "v125", "v126", "v127" "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
"v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28",
"v29", "v30", "v31", "v32", "v33", "v34", "v35", "v36", "v37",
"v38", "v39", "v40", "v41", "v42", "v43", "v44", "v45", "v46",
"v47", "v48", "v49", "v50", "v51", "v52", "v53", "v54", "v55",
"v56", "v57", "v58", "v59", "v60", "v61", "v62", "v63", "v64",
"v65", "v66", "v67", "v68", "v69", "v70", "v71", "v72", "v73",
"v74", "v75", "v76", "v77", "v78", "v79", "v80", "v81", "v82",
"v83", "v84", "v85", "v86", "v87", "v88", "v89", "v90", "v91",
"v92", "v93", "v94", "v95", "v96", "v97", "v98", "v99", "v100",
"v101", "v102", "v103", "v104", "v105", "v106", "v107", "v108",
"v109", "v110", "v111", "v112", "v113", "v114", "v115", "v116",
"v117", "v118", "v119", "v120", "v121", "v122", "v123", "v124",
"v125", "v126", "v127", "v128", "v129", "v130", "v131", "v132",
"v133", "v134", "v135", "v136", "v137", "v138", "v139", "v140",
"v141", "v142", "v143", "v144", "v145", "v146", "v147", "v148",
"v149", "v150", "v151", "v152", "v153", "v154", "v155", "v156",
"v157", "v158", "v159", "v160", "v161", "v162", "v163", "v164",
"v165", "v166", "v167", "v168", "v169", "v170", "v171", "v172",
"v173", "v174", "v175", "v176", "v177", "v178", "v179", "v180",
"v181", "v182", "v183", "v184", "v185", "v186", "v187", "v188",
"v189", "v190", "v191", "v192", "v193", "v194", "v195", "v196",
"v197", "v198", "v199", "v200", "v201", "v202", "v203", "v204",
"v205", "v206", "v207", "v208", "v209", "v210", "v211", "v212",
"v213", "v214", "v215", "v216", "v217", "v218", "v219", "v220",
"v221", "v222", "v223", "v224", "v225", "v226", "v227", "v228",
"v229", "v230", "v231", "v232", "v233", "v234", "v235", "v236",
"v237", "v238", "v239", "v240", "v241", "v242", "v243", "v244",
"v245", "v246", "v247", "v248", "v249", "v250", "v251", "v252",
"v253", "v254", "v255"
); );
// clang-format on // clang-format on
#pragma clang diagnostic pop #pragma clang diagnostic pop
int32x4_t v_acc[16]{0}; int32x4_t v_acc[16]{0};
v_acc[0][0] = v_z128; v_acc[0][0] = v_z0;
v_acc[0][1] = v_z129; v_acc[0][1] = v_z1;
v_acc[0][2] = v_z130; v_acc[0][2] = v_z2;
v_acc[0][3] = v_z131; v_acc[0][3] = v_z3;
v_acc[1][0] = v_z132; v_acc[1][0] = v_z4;
v_acc[1][1] = v_z133; v_acc[1][1] = v_z5;
v_acc[1][2] = v_z134; v_acc[1][2] = v_z6;
v_acc[1][3] = v_z135; v_acc[1][3] = v_z7;
v_acc[2][0] = v_z136; v_acc[2][0] = v_z8;
v_acc[2][1] = v_z137; v_acc[2][1] = v_z9;
v_acc[2][2] = v_z138; v_acc[2][2] = v_z10;
v_acc[2][3] = v_z139; v_acc[2][3] = v_z11;
v_acc[3][0] = v_z140; v_acc[3][0] = v_z12;
v_acc[3][1] = v_z141; v_acc[3][1] = v_z13;
v_acc[3][2] = v_z142; v_acc[3][2] = v_z14;
v_acc[3][3] = v_z143; v_acc[3][3] = v_z15;
v_acc[4][0] = v_z144; v_acc[4][0] = v_z16;
v_acc[4][1] = v_z145; v_acc[4][1] = v_z17;
v_acc[4][2] = v_z146; v_acc[4][2] = v_z18;
v_acc[4][3] = v_z147; v_acc[4][3] = v_z19;
v_acc[5][0] = v_z148; v_acc[5][0] = v_z20;
v_acc[5][1] = v_z149; v_acc[5][1] = v_z21;
v_acc[5][2] = v_z150; v_acc[5][2] = v_z22;
v_acc[5][3] = v_z151; v_acc[5][3] = v_z23;
v_acc[6][0] = v_z152; v_acc[6][0] = v_z24;
v_acc[6][1] = v_z153; v_acc[6][1] = v_z25;
v_acc[6][2] = v_z154; v_acc[6][2] = v_z26;
v_acc[6][3] = v_z155; v_acc[6][3] = v_z27;
v_acc[7][0] = v_z156; v_acc[7][0] = v_z28;
v_acc[7][1] = v_z157; v_acc[7][1] = v_z29;
v_acc[7][2] = v_z158; v_acc[7][2] = v_z30;
v_acc[7][3] = v_z159; v_acc[7][3] = v_z31;
v_acc[8][0] = v_z160; v_acc[8][0] = v_z32;
v_acc[8][1] = v_z161; v_acc[8][1] = v_z33;
v_acc[8][2] = v_z162; v_acc[8][2] = v_z34;
v_acc[8][3] = v_z163; v_acc[8][3] = v_z35;
v_acc[9][0] = v_z164; v_acc[9][0] = v_z36;
v_acc[9][1] = v_z165; v_acc[9][1] = v_z37;
v_acc[9][2] = v_z166; v_acc[9][2] = v_z38;
v_acc[9][3] = v_z167; v_acc[9][3] = v_z39;
v_acc[10][0] = v_z168; v_acc[10][0] = v_z40;
v_acc[10][1] = v_z169; v_acc[10][1] = v_z41;
v_acc[10][2] = v_z170; v_acc[10][2] = v_z42;
v_acc[10][3] = v_z171; v_acc[10][3] = v_z43;
v_acc[11][0] = v_z172; v_acc[11][0] = v_z44;
v_acc[11][1] = v_z173; v_acc[11][1] = v_z45;
v_acc[11][2] = v_z174; v_acc[11][2] = v_z46;
v_acc[11][3] = v_z175; v_acc[11][3] = v_z47;
v_acc[12][0] = v_z176; v_acc[12][0] = v_z48;
v_acc[12][1] = v_z177; v_acc[12][1] = v_z49;
v_acc[12][2] = v_z178; v_acc[12][2] = v_z50;
v_acc[12][3] = v_z179; v_acc[12][3] = v_z51;
v_acc[13][0] = v_z180; v_acc[13][0] = v_z52;
v_acc[13][1] = v_z181; v_acc[13][1] = v_z53;
v_acc[13][2] = v_z182; v_acc[13][2] = v_z54;
v_acc[13][3] = v_z183; v_acc[13][3] = v_z55;
v_acc[14][0] = v_z184; v_acc[14][0] = v_z56;
v_acc[14][1] = v_z185; v_acc[14][1] = v_z57;
v_acc[14][2] = v_z186; v_acc[14][2] = v_z58;
v_acc[14][3] = v_z187; v_acc[14][3] = v_z59;
v_acc[15][0] = v_z188; v_acc[15][0] = v_z60;
v_acc[15][1] = v_z189; v_acc[15][1] = v_z61;
v_acc[15][2] = v_z190; v_acc[15][2] = v_z62;
v_acc[15][3] = v_z191; v_acc[15][3] = v_z63;
// return local scratch // return local scratch
auto c = MakeCBlockTile(); auto c = MakeCBlockTile();
...@@ -572,191 +608,4 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16 ...@@ -572,191 +608,4 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16
} }
}; };
struct Flatmm_32x512x128_1x4x1_16x16x32_FP16 : public Flatmm_32x512x128_1x4x1_16x16x32_Base
{
using ADataType = fp16_t;
using BDataType = fp16_t;
// TODO: need paired with tile_window_linear!
// TODO: need call init_raw() before call this function!
template <typename ARes, typename ACoords, typename BRes, typename BCoords>
CK_TILE_DEVICE auto
operator()(const ARes& res_a,
const ACoords& cached_coords_a,
const BRes& res_b,
const BCoords& cached_coords_b,
CK_TILE_LDS_ADDR void* smem,
index_t k,
index_t tile_offset_a, // for each tile, the offset to move for each unroll
index_t tile_offset_b) // for each tile, the offset to move for each unroll
{
static_assert(ACoords::size() == Block_M * Block_K / BlockSize / 2 /*2x per dword*/); // 8
static_assert(BCoords::size() == Repeat_N);
auto a_sst = make_tile_window(
make_tensor_view<address_space_enum::lds>(
reinterpret_cast<CK_TILE_LDS_ADDR ADataType*>(smem), MakeLdsStoreDesc_A()),
MakeLdsStoreDesc_A().get_lengths(),
{0, 0, 0});
auto a_sld = [&]() {
constexpr auto a_warp_enc_ = GetGemm_AWarpEnc();
constexpr auto a_outer_dstr_enc = tile_distribution_encoding<
sequence<WarpPerBlock_N>,
tuple<sequence<Repeat_M, WarpPerBlock_M>, sequence<Repeat_K>>,
tuple<sequence<1, 0>>,
tuple<sequence<1, 0>>,
sequence<1, 2>,
sequence<0, 0>>{};
constexpr auto a_block_dstr_encode =
detail::make_embed_tile_distribution_encoding(a_outer_dstr_enc, a_warp_enc_);
return make_tile_window_linear(
make_tensor_view<address_space_enum::lds>(
reinterpret_cast<CK_TILE_LDS_ADDR ADataType*>(smem), MakeLdsLoadDesc_A()),
MakeLdsLoadDesc_A().get_lengths(),
{0, 0},
make_static_tile_distribution(a_block_dstr_encode));
}();
const index_t tile_offset_a_bytes = tile_offset_a * sizeof(ADataType);
const index_t tile_offset_b_bytes = tile_offset_b * sizeof(BDataType);
const auto [m0_init_value, size_per_issue] = get_async_store_smem_info(a_sst);
constexpr auto smem_buf_size =
MakeLdsLoadDesc_A().get_element_space_size() * sizeof(ADataType);
static_assert(a_sld.get_num_of_access() == 8);
constexpr auto sld_os = generate_tuple(
[&](auto i_access) {
return number<a_sld.get_bottom_linear_offset(i_access) * sizeof(ADataType)>{};
},
number<a_sld.get_num_of_access()>{});
index_t loop_cnt = k / Block_K;
// this is the acc thread buffer
fp32x4_t v_acc[16]{.0f};
// B nr->kr
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Winline-asm"
// clang-format off
asm volatile(
#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_FP16
#include "uk/flatmm_uk_gfx9_32x512x128_1x1x1_16x16x16.inc"
#undef CK_TILE_FLATMM_UK_MFMA
: [s_loop_cnt]"+s"(loop_cnt),
[v_acc_0]"+v"(v_acc[0]),
[v_acc_1]"+v"(v_acc[1]),
[v_acc_2]"+v"(v_acc[2]),
[v_acc_3]"+v"(v_acc[3]),
[v_acc_4]"+v"(v_acc[4]),
[v_acc_5]"+v"(v_acc[5]),
[v_acc_6]"+v"(v_acc[6]),
[v_acc_7]"+v"(v_acc[7]),
[v_acc_8]"+v"(v_acc[8]),
[v_acc_9]"+v"(v_acc[9]),
[v_acc_10]"+v"(v_acc[10]),
[v_acc_11]"+v"(v_acc[11]),
[v_acc_12]"+v"(v_acc[12]),
[v_acc_13]"+v"(v_acc[13]),
[v_acc_14]"+v"(v_acc[14]),
[v_acc_15]"+v"(v_acc[15]),
[s_mem_]"+r"(smem)
: [s_res_a0]"s"(res_a[0]),
[s_res_a1]"s"(res_a[1]),
[s_res_a2]"s"(res_a[2]),
[s_res_a3]"s"(res_a[3]),
[s_res_b0]"s"(res_b[0]),
[s_res_b1]"s"(res_b[1]),
[s_res_b2]"s"(res_b[2]),
[s_res_b3]"s"(res_b[3]),
[v_os_a0]"v"(static_cast<index_t>(cached_coords_a[number<0>{}] * sizeof(ADataType))),
[v_os_a1]"v"(static_cast<index_t>(cached_coords_a[number<1>{}] * sizeof(ADataType))),
[v_os_a2]"v"(static_cast<index_t>(cached_coords_a[number<2>{}] * sizeof(ADataType))),
[v_os_a3]"v"(static_cast<index_t>(cached_coords_a[number<3>{}] * sizeof(ADataType))),
[v_os_a4]"v"(static_cast<index_t>(cached_coords_a[number<4>{}] * sizeof(ADataType))),
[v_os_a5]"v"(static_cast<index_t>(cached_coords_a[number<5>{}] * sizeof(ADataType))),
[v_os_a6]"v"(static_cast<index_t>(cached_coords_a[number<6>{}] * sizeof(ADataType))),
[v_os_a7]"v"(static_cast<index_t>(cached_coords_a[number<7>{}] * sizeof(ADataType))),
[v_os_b0]"v"(static_cast<index_t>(cached_coords_b[number<0>{}] * sizeof(BDataType))),
[v_os_b1]"v"(static_cast<index_t>(cached_coords_b[number<1>{}] * sizeof(BDataType))),
[v_os_b2]"v"(static_cast<index_t>(cached_coords_b[number<2>{}] * sizeof(BDataType))),
[v_os_b3]"v"(static_cast<index_t>(cached_coords_b[number<3>{}] * sizeof(BDataType))),
[v_os_b4]"v"(static_cast<index_t>(cached_coords_b[number<4>{}] * sizeof(BDataType))),
[v_os_b5]"v"(static_cast<index_t>(cached_coords_b[number<5>{}] * sizeof(BDataType))),
[v_os_b6]"v"(static_cast<index_t>(cached_coords_b[number<6>{}] * sizeof(BDataType))),
[v_os_b7]"v"(static_cast<index_t>(cached_coords_b[number<7>{}] * sizeof(BDataType))),
[v_os_slda]"v"(static_cast<index_t>(a_sld.cached_coords_[number<0>{}].get_offset() * sizeof(ADataType))),
[s_m0_init]"s"(m0_init_value),
[s_size_per_issue]"s"(size_per_issue),
[smem_sz]"n"(smem_buf_size), //(smem_buf_size),
[sld_os_0]"n"(sld_os[number<0>{}].value),
[sld_os_1]"n"(sld_os[number<1>{}].value),
[sld_os_2]"n"(sld_os[number<2>{}].value),
[sld_os_3]"n"(sld_os[number<3>{}].value),
[sld_os_4]"n"(sld_os[number<4>{}].value),
[sld_os_5]"n"(sld_os[number<5>{}].value),
[sld_os_6]"n"(sld_os[number<6>{}].value),
[sld_os_7]"n"(sld_os[number<7>{}].value),
[s_tile_os_a]"s"(tile_offset_a_bytes),
[s_tile_os_b]"s"(tile_offset_b_bytes)
: "memory", "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8", "a9",
"a10", "a11", "a12", "a13", "a14", "a15", "a16", "a17", "a18", "a19",
"a20", "a21", "a22", "a23", "a24", "a25", "a26", "a27", "a28", "a29",
"a30", "a31", "a32", "a33", "a34", "a35", "a36", "a37", "a38", "a39",
"a40", "a41", "a42", "a43", "a44", "a45", "a46", "a47", "a48", "a49",
"a50", "a51", "a52", "a53", "a54", "a55", "a56", "a57", "a58", "a59",
"a60", "a61", "a62", "a63", "a64", "a65", "a66", "a67", "a68", "a69",
"a70", "a71", "a72", "a73", "a74", "a75", "a76", "a77", "a78", "a79",
"a80", "a81", "a82", "a83", "a84", "a85", "a86", "a87", "a88", "a89",
"a90", "a91", "a92", "a93", "a94", "a95", "a96", "a97", "a98", "a99",
"a100", "a101", "a102", "a103", "a104", "a105", "a106", "a107",
"a108", "a109", "a110", "a111", "a112", "a113", "a114", "a115",
"a116", "a117", "a118", "a119", "a120", "a121", "a122", "a123",
"a124", "a125", "a126", "a127", "a128", "a129", "a130", "a131",
"a132", "a133", "a134", "a135", "a136", "a137", "a138", "a139",
"a140", "a141", "a142", "a143", "a144", "a145", "a146", "a147",
"a148", "a149", "a150", "a151", "a152", "a153", "a154", "a155",
"a156", "a157", "a158", "a159", "a160", "a161", "a162", "a163",
"a164", "a165", "a166", "a167", "a168", "a169", "a170", "a171",
"a172", "a173", "a174", "a175", "a176", "a177", "a178", "a179",
"a180", "a181", "a182", "a183", "a184", "a185", "a186", "a187",
"a188", "a189", "a190", "a191", "a192", "a193", "a194", "a195",
"a196", "a197", "a198", "a199", "a200", "a201", "a202", "a203",
"a204", "a205", "a206", "a207", "a208", "a209", "a210", "a211",
"a212", "a213", "a214", "a215", "a216", "a217", "a218", "a219",
"a220", "a221", "a222", "a223", "a224", "a225", "a226", "a227",
"a228", "a229", "a230", "a231", "a232", "a233", "a234", "a235",
"a236", "a237", "a238", "a239", "a240", "a241", "a242", "a243",
"a244", "a245", "a246", "a247", "a248", "a249", "a250", "a251",
"a252", "a253", "a254", "a255",
"s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23",
"s86", // s86 as tmp
"v64", "v65", "v66", "v67", "v68", "v69",
"v70", "v71", "v72", "v73", "v74", "v75", "v76", "v77", "v78", "v79",
"v80", "v81", "v82", "v83", "v84", "v85", "v86", "v87", "v88", "v89",
"v90", "v91", "v92", "v93", "v94", "v95", "v96", "v97", "v98", "v99",
"v100", "v101", "v102", "v103", "v104", "v105", "v106", "v107",
"v108", "v109", "v110", "v111", "v112", "v113", "v114", "v115",
"v116", "v117", "v118", "v119", "v120", "v121", "v122", "v123",
"v124", "v125", "v126", "v127"
);
// clang-format on
#pragma clang diagnostic pop
// return local scratch
auto c = MakeCBlockTile();
for(auto i = 0; i < 16; i++)
{
c.get_thread_buffer()[4 * i + 0] = v_acc[i].x;
c.get_thread_buffer()[4 * i + 1] = v_acc[i].y;
c.get_thread_buffer()[4 * i + 2] = v_acc[i].z;
c.get_thread_buffer()[4 * i + 3] = v_acc[i].w;
}
return c;
}
};
} // namespace ck_tile } // namespace ck_tile
...@@ -73,6 +73,7 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x ...@@ -73,6 +73,7 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x
{ {
using BDataType = int8_t; using BDataType = int8_t;
using ODataType = int8_t; using ODataType = int8_t;
using DScaleDataType = float_t;
// TODO: need paired with tile_window_linear! // TODO: need paired with tile_window_linear!
// TODO: need call init_raw() before call this function! // TODO: need call init_raw() before call this function!
...@@ -111,38 +112,38 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x ...@@ -111,38 +112,38 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x
index_t loop_cnt = n / Block_N; index_t loop_cnt = n / Block_N;
register float v_c0 asm("v64"); // register float v_c0 asm("v64");
register float v_c1 asm("v65"); // register float v_c1 asm("v65");
register float v_c2 asm("v66"); // register float v_c2 asm("v66");
register float v_c3 asm("v67"); // register float v_c3 asm("v67");
register float v_c4 asm("v68"); // register float v_c4 asm("v68");
register float v_c5 asm("v69"); // register float v_c5 asm("v69");
register float v_c6 asm("v70"); // register float v_c6 asm("v70");
register float v_c7 asm("v71"); // register float v_c7 asm("v71");
register float v_c8 asm("v72"); // register float v_c8 asm("v72");
register float v_c9 asm("v73"); // register float v_c9 asm("v73");
register float v_c10 asm("v74"); // register float v_c10 asm("v74");
register float v_c11 asm("v75"); // register float v_c11 asm("v75");
register float v_c12 asm("v76"); // register float v_c12 asm("v76");
register float v_c13 asm("v77"); // register float v_c13 asm("v77");
register float v_c14 asm("v78"); // register float v_c14 asm("v78");
register float v_c15 asm("v79"); // register float v_c15 asm("v79");
register float v_c16 asm("v80"); // register float v_c16 asm("v80");
register float v_c17 asm("v81"); // register float v_c17 asm("v81");
register float v_c18 asm("v82"); // register float v_c18 asm("v82");
register float v_c19 asm("v83"); // register float v_c19 asm("v83");
register float v_c20 asm("v84"); // register float v_c20 asm("v84");
register float v_c21 asm("v85"); // register float v_c21 asm("v85");
register float v_c22 asm("v86"); // register float v_c22 asm("v86");
register float v_c23 asm("v87"); // register float v_c23 asm("v87");
register float v_c24 asm("v88"); // register float v_c24 asm("v88");
register float v_c25 asm("v89"); // register float v_c25 asm("v89");
register float v_c26 asm("v90"); // register float v_c26 asm("v90");
register float v_c27 asm("v91"); // register float v_c27 asm("v91");
register float v_c28 asm("v92"); // register float v_c28 asm("v92");
register float v_c29 asm("v93"); // register float v_c29 asm("v93");
register float v_c30 asm("v94"); // register float v_c30 asm("v94");
register float v_c31 asm("v95"); // register float v_c31 asm("v95");
int32_t nan_hi = 0x7fff0000; int32_t nan_hi = 0x7fff0000;
int32_t nan_lo = 0x00007fff; int32_t nan_lo = 0x00007fff;
...@@ -175,45 +176,44 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x ...@@ -175,45 +176,44 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x
#pragma clang diagnostic push #pragma clang diagnostic push
#pragma clang diagnostic ignored "-Winline-asm" #pragma clang diagnostic ignored "-Winline-asm"
asm volatile( asm volatile(
#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_BF16 #define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_INT8
#include "uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.inc" #include "uk/flatmm_sn_uk_gfx9_32x256x512_1x4x1_16x16x32_int8_1.inc"
#undef CK_TILE_FLATMM_UK_MFMA #undef CK_TILE_FLATMM_UK_MFMA
:[smem_]"+r"(smem), :[smem_]"+r"(smem),
[s_loop_cnt]"+s"(loop_cnt), [s_loop_cnt]"+s"(loop_cnt)
[c0]"+v" (v_c0), // [c0]"+v" (v_c0),
[c1]"+v" (v_c1), // [c1]"+v" (v_c1),
[c2]"+v" (v_c2), // [c2]"+v" (v_c2),
[c3]"+v" (v_c3), // [c3]"+v" (v_c3),
[c4]"+v" (v_c4), // [c4]"+v" (v_c4),
[c5]"+v" (v_c5), // [c5]"+v" (v_c5),
[c6]"+v" (v_c6), // [c6]"+v" (v_c6),
[c7]"+v" (v_c7), // [c7]"+v" (v_c7),
[c8]"+v" (v_c8), // [c8]"+v" (v_c8),
[c9]"+v" (v_c9), // [c9]"+v" (v_c9),
[c10]"+v"(v_c10), // [c10]"+v"(v_c10),
[c11]"+v"(v_c11), // [c11]"+v"(v_c11),
[c12]"+v"(v_c12), // [c12]"+v"(v_c12),
[c13]"+v"(v_c13), // [c13]"+v"(v_c13),
[c14]"+v"(v_c14), // [c14]"+v"(v_c14),
[c15]"+v"(v_c15), // [c15]"+v"(v_c15),
[c16]"+v"(v_c16), // [c16]"+v"(v_c16),
[c17]"+v"(v_c17), // [c17]"+v"(v_c17),
[c18]"+v"(v_c18), // [c18]"+v"(v_c18),
[c19]"+v"(v_c19), // [c19]"+v"(v_c19),
[c20]"+v"(v_c20), // [c20]"+v"(v_c20),
[c21]"+v"(v_c21), // [c21]"+v"(v_c21),
[c22]"+v"(v_c22), // [c22]"+v"(v_c22),
[c23]"+v"(v_c23), // [c23]"+v"(v_c23),
[c24]"+v"(v_c24), // [c24]"+v"(v_c24),
[c25]"+v"(v_c25), // [c25]"+v"(v_c25),
[c26]"+v"(v_c26), // [c26]"+v"(v_c26),
[c27]"+v"(v_c27), // [c27]"+v"(v_c27),
[c28]"+v"(v_c28), // [c28]"+v"(v_c28),
[c29]"+v"(v_c29), // [c29]"+v"(v_c29),
[c30]"+v"(v_c30), // [c30]"+v"(v_c30),
[c31]"+v"(v_c31) // [c31]"+v"(v_c31)
: :[sld_a_base]"n"(0),
[sld_a_base]"n"(0),
[shfl_base]"n"(0), [shfl_base]"n"(0),
[v_sld_y_os]"v"(sld_y_os), [v_sld_y_os]"v"(sld_y_os),
[v_sfl_sld]"v"(sfl_sld), [v_sfl_sld]"v"(sfl_sld),
...@@ -290,177 +290,85 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x ...@@ -290,177 +290,85 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x
"a236", "a237", "a238", "a239", "a240", "a241", "a242", "a243", "a236", "a237", "a238", "a239", "a240", "a241", "a242", "a243",
"a244", "a245", "a246", "a247", "a248", "a249", "a250", "a251", "a244", "a245", "a246", "a247", "a248", "a249", "a250", "a251",
"a252", "a253", "a254", "a255", "a252", "a253", "a254", "a255",
"s8", "s9", "s12", "s13", "s14", "s15", "s38", "s39", "s52", "s86", "s6", "s7", "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15",
"s36", "s37", "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23", "s24", "s25",
"v50", "v54", "v55", "s26", "s27", "s28", "s29", "s30", "s31", "s32", "s33", "s34", "s35",
"v64","v65","v66","v67","v68","v69","v70","v71", "s36", "s37", "s38", "s39", "s40", "s41", "s42", "s43", "s44", "s45",
"v72","v73","v74","v75","v76","v77","v78","v79", "s46", "s47", "s48", "s49", "s50", "s51", "s52", "s53", "s54",
"v80","v81","v82","v83","v84","v85","v86","v87", "s55", "s56", "s57", "s58", "s59", "s60", "s61", "s62", "s63",
"v88","v89","v90","v91","v92","v93","v94","v95", "s64", "s65", "s66", "s67", "s68", "s69", "s70", "s71", "s72",
"v128", "v129", "v130", "v131", "s73", "s74", "s75", "s76", "s77", "s78", "s79", "s80", // s86 as tmp
"v132", "v133", "v134", "v135", "v136", "v137", "v138", "v139", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
"v140", "v141", "v142", "v143", "v144", "v145", "v146", "v147", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
"v148", "v149", "v150", "v151", "v152", "v153", "v154", "v155", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28",
"v156", "v157", "v158", "v159", "v160", "v161", "v162", "v163", "v29", "v30", "v31", "v32", "v33", "v34", "v35", "v36", "v37",
"v164", "v165", "v166", "v167", "v168", "v169", "v170", "v171", "v38", "v39", "v40", "v41", "v42", "v43", "v44", "v45", "v46",
"v172", "v173", "v174", "v175", "v176", "v177", "v178", "v179", "v47", "v48", "v49", "v50", "v51", "v52", "v53", "v54", "v55",
"v180", "v181", "v182", "v183", "v184", "v185", "v186", "v187", "v56", "v57", "v58", "v59", "v60", "v61", "v62", "v63", "v64",
"v188", "v189", "v190", "v191", "v192", "v193", "v194", "v195", "v65", "v66", "v67", "v68", "v69", "v70", "v71", "v72", "v73",
"v196", "v197", "v198", "v199", "v200", "v201", "v202", "v203", "v74", "v75", "v76", "v77", "v78", "v79", "v80", "v81", "v82",
"v204", "v205", "v206", "v207", "v208", "v209", "v210", "v211", "v83", "v84", "v85", "v86", "v87", "v88", "v89", "v90", "v91",
"v212", "v213", "v214", "v215", "v216", "v217", "v218", "v219", "v92", "v93", "v94", "v95", "v96", "v97", "v98", "v99", "v100",
"v220", "v221", "v222", "v223", "v224", "v225", "v226", "v227", "v101", "v102", "v103", "v104", "v105", "v106", "v107", "v108",
"v228", "v229", "v230", "v231", "v232", "v233", "v234", "v235", "v109", "v110", "v111", "v112", "v113", "v114", "v115", "v116",
"v236", "v237", "v238", "v239", "v240", "v241", "v242", "v243", "v117", "v118", "v119", "v120", "v121", "v122", "v123", "v124",
"v244", "v245", "v246", "v247", "v248", "v249", "v250", "v251", "v125", "v126", "v127", "v128", "v129", "v130", "v131", "v132",
"v252", "v253", "v254", "v255" "v133", "v134", "v135", "v136", "v137", "v138", "v139", "v140",
"v141", "v142", "v143", "v144", "v145", "v146", "v147", "v148",
"v149", "v150", "v151", "v152", "v153", "v154", "v155", "v156",
"v157", "v158", "v159", "v160", "v161", "v162", "v163", "v164",
"v165", "v166", "v167", "v168", "v169", "v170", "v171", "v172",
"v173", "v174", "v175", "v176", "v177", "v178", "v179", "v180",
"v181", "v182", "v183", "v184", "v185", "v186", "v187", "v188",
"v189", "v190", "v191", "v192", "v193", "v194", "v195", "v196",
"v197", "v198", "v199", "v200", "v201", "v202", "v203", "v204",
"v205", "v206", "v207", "v208", "v209", "v210", "v211", "v212",
"v213", "v214", "v215", "v216", "v217", "v218", "v219", "v220",
"v221", "v222", "v223", "v224", "v225", "v226", "v227", "v228",
"v229", "v230", "v231", "v232", "v233", "v234", "v235", "v236",
"v237", "v238", "v239", "v240", "v241", "v242", "v243", "v244",
"v245", "v246", "v247", "v248", "v249", "v250", "v251", "v252",
"v253", "v254", "v255"
); );
#pragma clang diagnostic pop
// clang-format on
}
};
struct FlatmmSn_32x128x512_1x4x1_16x16x32_FP16 : public FlatmmSn_32x128x512_1x4x1_16x16x32_Base
{
using BDataType = bf16_t;
using ODataType = bf16_t;
// TODO: need paired with tile_window_linear!
// TODO: need call init_raw() before call this function!
// template <typename AWindow, typename BWindow, typename OWindow, typename ScaleTensor>
template <typename BRes,
typename BCoords,
typename ORes,
typename OCoords,
typename OFlags,
typename ScaleTensor>
CK_TILE_DEVICE auto
operator()(const BRes& res_b,
const BCoords& cached_coords_b,
const ORes& res_o,
const OCoords& cached_coords_o,
const OFlags& o_flags, // this should be in sgpr
CK_TILE_LDS_ADDR void* smem,
index_t n, // loop along n dim
const ScaleTensor& scale_,
index_t tile_offset_b, // stride b is fixed to blockKr * blockW, but still can adjust
index_t tile_offset_o)
{
static_assert(BCoords::size() == 8); // 8
static_assert(OCoords::size() == 8);
const index_t tile_stride_b_bytes = tile_offset_b * sizeof(BDataType);
const index_t tile_stride_o_bytes = tile_offset_o * sizeof(ODataType);
static_assert(ScaleTensor::size() == 2);
float s0 = scale_[number<0>{}];
float s1 = scale_[number<1>{}];
index_t loop_cnt = n / Block_N;
register float v_c0 asm("v64");
register float v_c1 asm("v65");
register float v_c2 asm("v66");
register float v_c3 asm("v67");
register float v_c4 asm("v68");
register float v_c5 asm("v69");
register float v_c6 asm("v70");
register float v_c7 asm("v71");
register float v_c8 asm("v72");
register float v_c9 asm("v73");
register float v_c10 asm("v74");
register float v_c11 asm("v75");
register float v_c12 asm("v76");
register float v_c13 asm("v77");
register float v_c14 asm("v78");
register float v_c15 asm("v79");
register float v_c16 asm("v80");
register float v_c17 asm("v81");
register float v_c18 asm("v82");
register float v_c19 asm("v83");
register float v_c20 asm("v84");
register float v_c21 asm("v85");
register float v_c22 asm("v86");
register float v_c23 asm("v87");
register float v_c24 asm("v88");
register float v_c25 asm("v89");
register float v_c26 asm("v90");
register float v_c27 asm("v91");
register float v_c28 asm("v92");
register float v_c29 asm("v93");
register float v_c30 asm("v94");
register float v_c31 asm("v95");
int32_t nan_hi = 0x7fff0000;
int32_t nan_lo = 0x00007fff;
// in smem, the layout is M0(2)*K0(128)*M1(16)*K1(4)
// every threads need 8xK in contiguous register
// ... and every wave need the same data
int lane_id = threadIdx.x % 64;
int sld_y_os = (lane_id % 16) * 4 + (lane_id / 16) * 128;
sld_y_os *= 2;
// y y p p p y
// reg before shfl M0(2)*N0(2)*Nl(4)*Nw(4)*Mw(16)*Nv(4)
// but order is N0*M0*Nv
// in LDS we need store as
// M0(2)* N0(2) * Nl(4) * Nw(4) * (Mw(16)*Nv(4) + 4)
// y y wave-id lid/16 lid%16 v
// sst(v3) = (v0/16*34 + v0%16 * 2 + wid*136) * 4
int sfl_sst = (threadIdx.x % 16 * 4) + (threadIdx.x / 16) * (64 + 4);
sfl_sst *= 2;
// from LDS we need load as
// M0(2)* N0(2) * Nl(4) * Nw(4) * (Mw(16) * Nv(4) + 4)
// ( 2 issue) (rem 32-lane) (4 wave*4issue) 2lane*1ussue(pk2)
// sld(v4) = v0/2 *34*4 + v0 % 2 *4 + wid*2 *4
int sfl_sld = (lane_id % 2) * 2 + (lane_id / 2) * (64 + 4) + (threadIdx.x / 64) * 4;
sfl_sld *= 2;
// B nr->kr
// clang-format off
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Winline-asm"
asm volatile( asm volatile(
#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_FP16 #define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_INT8
#include "uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.inc" #include "uk/flatmm_sn_uk_gfx9_32x256x512_1x4x1_16x16x32_int8_2.inc"
#undef CK_TILE_FLATMM_UK_MFMA #undef CK_TILE_FLATMM_UK_MFMA
:[smem_]"+r"(smem), :[smem_]"+r"(smem),
[s_loop_cnt]"+s"(loop_cnt), [s_loop_cnt]"+s"(loop_cnt)
[c0]"+v" (v_c0), // [c0]"+v" (v_c0),
[c1]"+v" (v_c1), // [c1]"+v" (v_c1),
[c2]"+v" (v_c2), // [c2]"+v" (v_c2),
[c3]"+v" (v_c3), // [c3]"+v" (v_c3),
[c4]"+v" (v_c4), // [c4]"+v" (v_c4),
[c5]"+v" (v_c5), // [c5]"+v" (v_c5),
[c6]"+v" (v_c6), // [c6]"+v" (v_c6),
[c7]"+v" (v_c7), // [c7]"+v" (v_c7),
[c8]"+v" (v_c8), // [c8]"+v" (v_c8),
[c9]"+v" (v_c9), // [c9]"+v" (v_c9),
[c10]"+v"(v_c10), // [c10]"+v"(v_c10),
[c11]"+v"(v_c11), // [c11]"+v"(v_c11),
[c12]"+v"(v_c12), // [c12]"+v"(v_c12),
[c13]"+v"(v_c13), // [c13]"+v"(v_c13),
[c14]"+v"(v_c14), // [c14]"+v"(v_c14),
[c15]"+v"(v_c15), // [c15]"+v"(v_c15),
[c16]"+v"(v_c16), // [c16]"+v"(v_c16),
[c17]"+v"(v_c17), // [c17]"+v"(v_c17),
[c18]"+v"(v_c18), // [c18]"+v"(v_c18),
[c19]"+v"(v_c19), // [c19]"+v"(v_c19),
[c20]"+v"(v_c20), // [c20]"+v"(v_c20),
[c21]"+v"(v_c21), // [c21]"+v"(v_c21),
[c22]"+v"(v_c22), // [c22]"+v"(v_c22),
[c23]"+v"(v_c23), // [c23]"+v"(v_c23),
[c24]"+v"(v_c24), // [c24]"+v"(v_c24),
[c25]"+v"(v_c25), // [c25]"+v"(v_c25),
[c26]"+v"(v_c26), // [c26]"+v"(v_c26),
[c27]"+v"(v_c27), // [c27]"+v"(v_c27),
[c28]"+v"(v_c28), // [c28]"+v"(v_c28),
[c29]"+v"(v_c29), // [c29]"+v"(v_c29),
[c30]"+v"(v_c30), // [c30]"+v"(v_c30),
[c31]"+v"(v_c31) // [c31]"+v"(v_c31)
: :[sld_a_base]"n"(0),
[sld_a_base]"n"(0),
[shfl_base]"n"(0), [shfl_base]"n"(0),
[v_sld_y_os]"v"(sld_y_os), [v_sld_y_os]"v"(sld_y_os),
[v_sfl_sld]"v"(sfl_sld), [v_sfl_sld]"v"(sfl_sld),
...@@ -491,7 +399,9 @@ struct FlatmmSn_32x128x512_1x4x1_16x16x32_FP16 : public FlatmmSn_32x128x512_1x4x ...@@ -491,7 +399,9 @@ struct FlatmmSn_32x128x512_1x4x1_16x16x32_FP16 : public FlatmmSn_32x128x512_1x4x
[v_os_b7]"v"(static_cast<index_t>(cached_coords_b[number<7>{}] * sizeof(BDataType))), [v_os_b7]"v"(static_cast<index_t>(cached_coords_b[number<7>{}] * sizeof(BDataType))),
[s_tile_os_o]"s"(tile_stride_o_bytes), [s_tile_os_o]"s"(tile_stride_o_bytes),
[s_tile_os_b_half]"s"(tile_offset_half_b_bytes),
[s_tile_os_b]"s"(tile_stride_b_bytes), [s_tile_os_b]"s"(tile_stride_b_bytes),
[s_tile_os_dq]"s"(tile_stride_dq_bytes),
[scale_0]"v"(s0), [scale_0]"v"(s0),
[scale_1]"v"(s1), [scale_1]"v"(s1),
[v_nan_lo]"v"(nan_lo), [v_nan_lo]"v"(nan_lo),
...@@ -535,30 +445,45 @@ struct FlatmmSn_32x128x512_1x4x1_16x16x32_FP16 : public FlatmmSn_32x128x512_1x4x ...@@ -535,30 +445,45 @@ struct FlatmmSn_32x128x512_1x4x1_16x16x32_FP16 : public FlatmmSn_32x128x512_1x4x
"a236", "a237", "a238", "a239", "a240", "a241", "a242", "a243", "a236", "a237", "a238", "a239", "a240", "a241", "a242", "a243",
"a244", "a245", "a246", "a247", "a248", "a249", "a250", "a251", "a244", "a245", "a246", "a247", "a248", "a249", "a250", "a251",
"a252", "a253", "a254", "a255", "a252", "a253", "a254", "a255",
"s8", "s9", "s12", "s13", "s14", "s15", "s38", "s39", "s52", "s86", "s6", "s7", "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15",
"s36", "s37", "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23", "s24", "s25",
"v50", "v54", "v55", "s26", "s27", "s28", "s29", "s30", "s31", "s32", "s33", "s34", "s35",
"v64","v65","v66","v67","v68","v69","v70","v71", "s36", "s37", "s38", "s39", "s40", "s41", "s42", "s43", "s44", "s45",
"v72","v73","v74","v75","v76","v77","v78","v79", "s46", "s47", "s48", "s49", "s50", "s51", "s52", "s53", "s54",
"v80","v81","v82","v83","v84","v85","v86","v87", "s55", "s56", "s57", "s58", "s59", "s60", "s61", "s62", "s63",
"v88","v89","v90","v91","v92","v93","v94","v95", "s64", "s65", "s66", "s67", "s68", "s69", "s70", "s71", "s72",
"v128", "v129", "v130", "v131", "s73", "s74", "s75", "s76", "s77", "s78", "s79", "s80", // s86 as tmp
"v132", "v133", "v134", "v135", "v136", "v137", "v138", "v139", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
"v140", "v141", "v142", "v143", "v144", "v145", "v146", "v147", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
"v148", "v149", "v150", "v151", "v152", "v153", "v154", "v155", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28",
"v156", "v157", "v158", "v159", "v160", "v161", "v162", "v163", "v29", "v30", "v31", "v32", "v33", "v34", "v35", "v36", "v37",
"v164", "v165", "v166", "v167", "v168", "v169", "v170", "v171", "v38", "v39", "v40", "v41", "v42", "v43", "v44", "v45", "v46",
"v172", "v173", "v174", "v175", "v176", "v177", "v178", "v179", "v47", "v48", "v49", "v50", "v51", "v52", "v53", "v54", "v55",
"v180", "v181", "v182", "v183", "v184", "v185", "v186", "v187", "v56", "v57", "v58", "v59", "v60", "v61", "v62", "v63", "v64",
"v188", "v189", "v190", "v191", "v192", "v193", "v194", "v195", "v65", "v66", "v67", "v68", "v69", "v70", "v71", "v72", "v73",
"v196", "v197", "v198", "v199", "v200", "v201", "v202", "v203", "v74", "v75", "v76", "v77", "v78", "v79", "v80", "v81", "v82",
"v204", "v205", "v206", "v207", "v208", "v209", "v210", "v211", "v83", "v84", "v85", "v86", "v87", "v88", "v89", "v90", "v91",
"v212", "v213", "v214", "v215", "v216", "v217", "v218", "v219", "v92", "v93", "v94", "v95", "v96", "v97", "v98", "v99", "v100",
"v220", "v221", "v222", "v223", "v224", "v225", "v226", "v227", "v101", "v102", "v103", "v104", "v105", "v106", "v107", "v108",
"v228", "v229", "v230", "v231", "v232", "v233", "v234", "v235", "v109", "v110", "v111", "v112", "v113", "v114", "v115", "v116",
"v236", "v237", "v238", "v239", "v240", "v241", "v242", "v243", "v117", "v118", "v119", "v120", "v121", "v122", "v123", "v124",
"v244", "v245", "v246", "v247", "v248", "v249", "v250", "v251", "v125", "v126", "v127", "v128", "v129", "v130", "v131", "v132",
"v252", "v253", "v254", "v255" "v133", "v134", "v135", "v136", "v137", "v138", "v139", "v140",
"v141", "v142", "v143", "v144", "v145", "v146", "v147", "v148",
"v149", "v150", "v151", "v152", "v153", "v154", "v155", "v156",
"v157", "v158", "v159", "v160", "v161", "v162", "v163", "v164",
"v165", "v166", "v167", "v168", "v169", "v170", "v171", "v172",
"v173", "v174", "v175", "v176", "v177", "v178", "v179", "v180",
"v181", "v182", "v183", "v184", "v185", "v186", "v187", "v188",
"v189", "v190", "v191", "v192", "v193", "v194", "v195", "v196",
"v197", "v198", "v199", "v200", "v201", "v202", "v203", "v204",
"v205", "v206", "v207", "v208", "v209", "v210", "v211", "v212",
"v213", "v214", "v215", "v216", "v217", "v218", "v219", "v220",
"v221", "v222", "v223", "v224", "v225", "v226", "v227", "v228",
"v229", "v230", "v231", "v232", "v233", "v234", "v235", "v236",
"v237", "v238", "v239", "v240", "v241", "v242", "v243", "v244",
"v245", "v246", "v247", "v248", "v249", "v250", "v251", "v252",
"v253", "v254", "v255"
); );
#pragma clang diagnostic pop #pragma clang diagnostic pop
// clang-format on // clang-format on
......
#ifndef CK_TILE_FLATMM_UK_MFMA
#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_INT8
#endif
#if CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_INT8
# define _UK_MFMA_ "v_mfma_i32_16x16x32_i8"
# define _UK_PK_CVT_(x0_, x1_, y_) \
" v_cmp_u_f32 s[36:37], " x0_ ", " x0_ " \n" \
" v_add3_u32 v50, " x0_ ", %[v_nan_lo], 1 \n" \
" v_cndmask_b32 v54, v50, %[v_nan_hi], s[36:37] \n" \
" v_cmp_u_f32 s[36:37], " x1_ ", " x1_ " \n" \
" v_add3_u32 v50, " x1_ ", %[v_nan_lo], 1 \n" \
" v_cndmask_b32 v55, v50, %[v_nan_hi], s[36:37] \n" \
" v_perm_b32 " y_ ", v55, v54, s52 \n"
# define _UK_ATOMIC_ADD_ "global_atomic_pk_add_bf16"
#elif CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_FP16
# define _UK_MFMA_ "v_mfma_f32_16x16x16_f16"
# define _UK_PK_CVT_(x0_, x1_, y_) \
" v_cvt_f16_f32 v54, " x0_ " \n" \
" v_cvt_f16_f32 v55, " x1_ " \n" \
" v_pack_b32_f16 " y_ ", v54, v55 \n"
# define _UK_ATOMIC_ADD_ "global_atomic_pk_add_f16"
#endif
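For readers not fluent in GCN assembly, here is a hedged C++ sketch of what the int8 path's _UK_PK_CVT_ macro above appears to compute; the helper name is mine, and the exact half-word order of the final pack depends on the s52 v_perm_b32 selector, which is set elsewhere:

    #include <cstdint>
    #include <cstring>

    // Sketch only: two fp32 accumulators are converted to bf16 by adding 0x8000
    // to the raw bits before taking the high half-word (the v_add3_u32 of the
    // value, 0x7fff and 1), NaNs are quieted to the bf16 pattern 0x7fff (via the
    // 0x7fff0000 select), and the two half-words are packed into one dword,
    // ready for the global_atomic_pk_add_bf16 store.
    inline std::uint32_t pk_cvt_bf16(float x0, float x1)
    {
        auto to_bf16 = [](float x) -> std::uint16_t {
            std::uint32_t u;
            std::memcpy(&u, &x, sizeof(u));
            if(x != x)          // v_cmp_u_f32: unordered compare, i.e. NaN
                return 0x7fffu; // quiet-NaN bf16
            return static_cast<std::uint16_t>((u + 0x8000u) >> 16);
        };
        return (static_cast<std::uint32_t>(to_bf16(x1)) << 16) | to_bf16(x0);
    }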
" s_mov_b32 s8, %[s_res_o0] \n"
" s_mov_b32 s9, %[s_res_o1] \n"
" s_mov_b32 s12, %[s_res_b0] \n"
" s_mov_b32 s13, %[s_res_b1] \n"
" s_mov_b32 s14, %[s_res_b2] \n"
" s_mov_b32 s15, %[s_res_b3] \n"
" s_waitcnt vmcnt(24) \n"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen\n"
" v_mul_f32 v54, v128, v128 \n"
" v_mul_f32 v55, v129, v129 \n"
" v_mul_f32 v56, v130, v130 \n"
" v_mul_f32 v57, v131, v131 \n"
" v_fma_f32 v54, v54, s77, v1 \n"
" v_fma_f32 v55, v55, s77, v1 \n"
" v_fma_f32 v56, v56, s77, v1 \n"
" v_fma_f32 v57, v57, s77, v1 \n"
" v_mul_f32 v54, v54, v128 \n"
" v_mul_f32 v55, v55, v129 \n"
" v_mul_f32 v56, v56, v130 \n"
" v_mul_f32 v57, v57, v131 \n"
" v_mul_f32 v54, v54, s6 \n"
" v_mul_f32 v55, v55, s6 \n"
" v_mul_f32 v56, v56, s6 \n"
" v_mul_f32 v57, v57, s6 \n"
" v_exp_f32 v54, v54 \n"
" v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024\n"
" v_add_f32 v54, v54, 1.0 \n"
" v_add_f32 v55, v55, 1.0 \n"
" v_add_f32 v56, v56, 1.0 \n"
" v_add_f32 v57, v57, 1.0 \n"
" v_rcp_f32 v54, v54 \n"
" v_rcp_f32 v55, v55 \n"
" v_rcp_f32 v56, v56 \n"
" v_rcp_f32 v57, v57 \n"
" v_mul_f32 v128, v128, v54 \n"
" v_mul_f32 v129, v129, v55 \n"
" v_mul_f32 v130, v130, v56 \n"
" v_mul_f32 v131, v131, v57 \n"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048\n"
" v_mul_f32 v54, v132, v132 \n"
" v_mul_f32 v55, v133, v133 \n"
" v_mul_f32 v56, v134, v134 \n"
" v_mul_f32 v57, v135, v135 \n"
" v_fma_f32 v54, v54, s77, v1 \n"
" v_fma_f32 v55, v55, s77, v1 \n"
" v_fma_f32 v56, v56, s77, v1 \n"
" v_fma_f32 v57, v57, s77, v1 \n"
" v_mul_f32 v54, v54, v132 \n"
" v_mul_f32 v55, v55, v133 \n"
" v_mul_f32 v56, v56, v134 \n"
" v_mul_f32 v57, v57, v135 \n"
" v_mul_f32 v54, v54, s6 \n"
" v_mul_f32 v55, v55, s6 \n"
" v_mul_f32 v56, v56, s6 \n"
" v_mul_f32 v57, v57, s6 \n"
" v_exp_f32 v54, v54 \n"
" v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072\n"
" v_add_f32 v54, v54, 1.0 \n"
" v_add_f32 v55, v55, 1.0 \n"
" v_add_f32 v56, v56, 1.0 \n"
" v_add_f32 v57, v57, 1.0 \n"
" v_rcp_f32 v54, v54 \n"
" v_rcp_f32 v55, v55 \n"
" v_rcp_f32 v56, v56 \n"
" v_rcp_f32 v57, v57 \n"
" v_mul_f32 v132, v132, v54 \n"
" v_mul_f32 v133, v133, v55 \n"
" v_mul_f32 v134, v134, v56 \n"
" v_mul_f32 v135, v135, v57 \n"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen\n"
" v_mul_f32 v54, v136, v136 \n"
" v_mul_f32 v55, v137, v137 \n"
" v_mul_f32 v56, v138, v138 \n"
" v_mul_f32 v57, v139, v139 \n"
" v_fma_f32 v54, v54, s77, v1 \n"
" v_fma_f32 v55, v55, s77, v1 \n"
" v_fma_f32 v56, v56, s77, v1 \n"
" v_fma_f32 v57, v57, s77, v1 \n"
" v_mul_f32 v54, v54, v136 \n"
" v_mul_f32 v55, v55, v137 \n"
" v_mul_f32 v56, v56, v138 \n"
" v_mul_f32 v57, v57, v139 \n"
" v_mul_f32 v54, v54, s6 \n"
" v_mul_f32 v55, v55, s6 \n"
" v_mul_f32 v56, v56, s6 \n"
" v_mul_f32 v57, v57, s6 \n"
" v_exp_f32 v54, v54 \n"
" v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024\n"
" v_add_f32 v54, v54, 1.0 \n"
" v_add_f32 v55, v55, 1.0 \n"
" v_add_f32 v56, v56, 1.0 \n"
" v_add_f32 v57, v57, 1.0 \n"
" v_rcp_f32 v54, v54 \n"
" v_rcp_f32 v55, v55 \n"
" v_rcp_f32 v56, v56 \n"
" v_rcp_f32 v57, v57 \n"
" v_mul_f32 v136, v136, v54 \n"
" v_mul_f32 v137, v137, v55 \n"
" v_mul_f32 v138, v138, v56 \n"
" v_mul_f32 v139, v139, v57 \n"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048\n"
" v_mul_f32 v54, v140, v140 \n"
" v_mul_f32 v55, v141, v141 \n"
" v_mul_f32 v56, v142, v142 \n"
" v_mul_f32 v57, v143, v143 \n"
" v_fma_f32 v54, v54, s77, v1 \n"
" v_fma_f32 v55, v55, s77, v1 \n"
" v_fma_f32 v56, v56, s77, v1 \n"
" v_fma_f32 v57, v57, s77, v1 \n"
" v_mul_f32 v54, v54, v140 \n"
" v_mul_f32 v55, v55, v141 \n"
" v_mul_f32 v56, v56, v142 \n"
" v_mul_f32 v57, v57, v143 \n"
" v_mul_f32 v54, v54, s6 \n"
" v_mul_f32 v55, v55, s6 \n"
" v_mul_f32 v56, v56, s6 \n"
" v_mul_f32 v57, v57, s6 \n"
" v_exp_f32 v54, v54 \n"
" v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072\n"
" v_add_f32 v54, v54, 1.0 \n"
" v_add_f32 v55, v55, 1.0 \n"
" v_add_f32 v56, v56, 1.0 \n"
" v_add_f32 v57, v57, 1.0 \n"
" v_rcp_f32 v54, v54 \n"
" v_rcp_f32 v55, v55 \n"
" v_rcp_f32 v56, v56 \n"
" v_rcp_f32 v57, v57 \n"
" v_mul_f32 v140, v140, v54 \n"
" v_mul_f32 v141, v141, v55 \n"
" v_mul_f32 v142, v142, v56 \n"
" v_mul_f32 v143, v143, v57 \n"
" s_waitcnt vmcnt(24) \n"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen\n"
" v_mul_f32 v54, v144, v144 \n"
" v_mul_f32 v55, v145, v145 \n"
" v_mul_f32 v56, v146, v146 \n"
" v_mul_f32 v57, v147, v147 \n"
" v_fma_f32 v54, v54, s77, v1 \n"
" v_fma_f32 v55, v55, s77, v1 \n"
" v_fma_f32 v56, v56, s77, v1 \n"
" v_fma_f32 v57, v57, s77, v1 \n"
" v_mul_f32 v54, v54, v144 \n"
" v_mul_f32 v55, v55, v145 \n"
" v_mul_f32 v56, v56, v146 \n"
" v_mul_f32 v57, v57, v147 \n"
" v_mul_f32 v54, v54, s6 \n"
" v_mul_f32 v55, v55, s6 \n"
" v_mul_f32 v56, v56, s6 \n"
" v_mul_f32 v57, v57, s6 \n"
" v_exp_f32 v54, v54 \n"
" v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024\n"
" v_add_f32 v54, v54, 1.0 \n"
" v_add_f32 v55, v55, 1.0 \n"
" v_add_f32 v56, v56, 1.0 \n"
" v_add_f32 v57, v57, 1.0 \n"
" v_rcp_f32 v54, v54 \n"
" v_rcp_f32 v55, v55 \n"
" v_rcp_f32 v56, v56 \n"
" v_rcp_f32 v57, v57 \n"
" v_mul_f32 v144, v144, v54 \n"
" v_mul_f32 v145, v145, v55 \n"
" v_mul_f32 v146, v146, v56 \n"
" v_mul_f32 v147, v147, v57 \n"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048\n"
" v_mul_f32 v54, v148, v148 \n"
" v_mul_f32 v55, v149, v149 \n"
" v_mul_f32 v56, v150, v150 \n"
" v_mul_f32 v57, v151, v151 \n"
" v_fma_f32 v54, v54, s77, v1 \n"
" v_fma_f32 v55, v55, s77, v1 \n"
" v_fma_f32 v56, v56, s77, v1 \n"
" v_fma_f32 v57, v57, s77, v1 \n"
" v_mul_f32 v54, v54, v148 \n"
" v_mul_f32 v55, v55, v149 \n"
" v_mul_f32 v56, v56, v150 \n"
" v_mul_f32 v57, v57, v151 \n"
" v_mul_f32 v54, v54, s6 \n"
" v_mul_f32 v55, v55, s6 \n"
" v_mul_f32 v56, v56, s6 \n"
" v_mul_f32 v57, v57, s6 \n"
" v_exp_f32 v54, v54 \n"
" v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072\n"
" v_add_f32 v54, v54, 1.0 \n"
" v_add_f32 v55, v55, 1.0 \n"
" v_add_f32 v56, v56, 1.0 \n"
" v_add_f32 v57, v57, 1.0 \n"
" v_rcp_f32 v54, v54 \n"
" v_rcp_f32 v55, v55 \n"
" v_rcp_f32 v56, v56 \n"
" v_rcp_f32 v57, v57 \n"
" v_mul_f32 v148, v148, v54 \n"
" v_mul_f32 v149, v149, v55 \n"
" v_mul_f32 v150, v150, v56 \n"
" v_mul_f32 v151, v151, v57 \n"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen\n"
" v_mul_f32 v54, v152, v152 \n"
" v_mul_f32 v55, v153, v153 \n"
" v_mul_f32 v56, v154, v154 \n"
" v_mul_f32 v57, v155, v155 \n"
" v_fma_f32 v54, v54, s77, v1 \n"
" v_fma_f32 v55, v55, s77, v1 \n"
" v_fma_f32 v56, v56, s77, v1 \n"
" v_fma_f32 v57, v57, s77, v1 \n"
" v_mul_f32 v54, v54, v152 \n"
" v_mul_f32 v55, v55, v153 \n"
" v_mul_f32 v56, v56, v154 \n"
" v_mul_f32 v57, v57, v155 \n"
" v_mul_f32 v54, v54, s6 \n"
" v_mul_f32 v55, v55, s6 \n"
" v_mul_f32 v56, v56, s6 \n"
" v_mul_f32 v57, v57, s6 \n"
" v_exp_f32 v54, v54 \n"
" v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024\n"
" v_add_f32 v54, v54, 1.0 \n"
" v_add_f32 v55, v55, 1.0 \n"
" v_add_f32 v56, v56, 1.0 \n"
" v_add_f32 v57, v57, 1.0 \n"
" v_rcp_f32 v54, v54 \n"
" v_rcp_f32 v55, v55 \n"
" v_rcp_f32 v56, v56 \n"
" v_rcp_f32 v57, v57 \n"
" v_mul_f32 v152, v152, v54 \n"
" v_mul_f32 v153, v153, v55 \n"
" v_mul_f32 v154, v154, v56 \n"
" v_mul_f32 v155, v155, v57 \n"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048\n"
" v_mul_f32 v54, v156, v156 \n"
" v_mul_f32 v55, v157, v157 \n"
" v_mul_f32 v56, v158, v158 \n"
" v_mul_f32 v57, v159, v159 \n"
" v_fma_f32 v54, v54, s77, v1 \n"
" v_fma_f32 v55, v55, s77, v1 \n"
" v_fma_f32 v56, v56, s77, v1 \n"
" v_fma_f32 v57, v57, s77, v1 \n"
" v_mul_f32 v54, v54, v156 \n"
" v_mul_f32 v55, v55, v157 \n"
" v_mul_f32 v56, v56, v158 \n"
" v_mul_f32 v57, v57, v159 \n"
" v_mul_f32 v54, v54, s6 \n"
" v_mul_f32 v55, v55, s6 \n"
" v_mul_f32 v56, v56, s6 \n"
" v_mul_f32 v57, v57, s6 \n"
" v_exp_f32 v54, v54 \n"
" v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072\n"
" s_add_u32 s12, %[s_tile_os_b_half], s12 \n"
" s_addc_u32 s13, 0, s13 \n"
" v_add_f32 v54, v54, 1.0 \n"
" v_add_f32 v55, v55, 1.0 \n"
" v_add_f32 v56, v56, 1.0 \n"
" v_add_f32 v57, v57, 1.0 \n"
" v_rcp_f32 v54, v54 \n"
" v_rcp_f32 v55, v55 \n"
" v_rcp_f32 v56, v56 \n"
" v_rcp_f32 v57, v57 \n"
" v_mul_f32 v156, v156, v54 \n"
" v_mul_f32 v157, v157, v55 \n"
" v_mul_f32 v158, v158, v56 \n"
" v_mul_f32 v159, v159, v57 \n"
" s_waitcnt vmcnt(24) \n"
" buffer_load_dwordx4 acc[64:67], %[v_os_b0], s[12:15], 0 offen\n"
" v_mul_f32 v54, v160, v160 \n"
" v_mul_f32 v55, v161, v161 \n"
" v_mul_f32 v56, v162, v162 \n"
" v_mul_f32 v57, v163, v163 \n"
" v_fma_f32 v54, v54, s77, v1 \n"
" v_fma_f32 v55, v55, s77, v1 \n"
" v_fma_f32 v56, v56, s77, v1 \n"
" v_fma_f32 v57, v57, s77, v1 \n"
" v_mul_f32 v54, v54, v160 \n"
" v_mul_f32 v55, v55, v161 \n"
" v_mul_f32 v56, v56, v162 \n"
" v_mul_f32 v57, v57, v163 \n"
" v_mul_f32 v54, v54, s6 \n"
" v_mul_f32 v55, v55, s6 \n"
" v_mul_f32 v56, v56, s6 \n"
" v_mul_f32 v57, v57, s6 \n"
" v_exp_f32 v54, v54 \n"
" v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[68:71], %[v_os_b0], s[12:15], 0 offen offset:1024\n"
" v_add_f32 v54, v54, 1.0 \n"
" v_add_f32 v55, v55, 1.0 \n"
" v_add_f32 v56, v56, 1.0 \n"
" v_add_f32 v57, v57, 1.0 \n"
" v_rcp_f32 v54, v54 \n"
" v_rcp_f32 v55, v55 \n"
" v_rcp_f32 v56, v56 \n"
" v_rcp_f32 v57, v57 \n"
" v_mul_f32 v160, v160, v54 \n"
" v_mul_f32 v161, v161, v55 \n"
" v_mul_f32 v162, v162, v56 \n"
" v_mul_f32 v163, v163, v57 \n"
" buffer_load_dwordx4 acc[72:75], %[v_os_b0], s[12:15], 0 offen offset:2048\n"
" v_mul_f32 v54, v164, v164 \n"
" v_mul_f32 v55, v165, v165 \n"
" v_mul_f32 v56, v166, v166 \n"
" v_mul_f32 v57, v167, v167 \n"
" v_fma_f32 v54, v54, s77, v1 \n"
" v_fma_f32 v55, v55, s77, v1 \n"
" v_fma_f32 v56, v56, s77, v1 \n"
" v_fma_f32 v57, v57, s77, v1 \n"
" v_mul_f32 v54, v54, v164 \n"
" v_mul_f32 v55, v55, v165 \n"
" v_mul_f32 v56, v56, v166 \n"
" v_mul_f32 v57, v57, v167 \n"
" v_mul_f32 v54, v54, s6 \n"
" v_mul_f32 v55, v55, s6 \n"
" v_mul_f32 v56, v56, s6 \n"
" v_mul_f32 v57, v57, s6 \n"
" v_exp_f32 v54, v54 \n"
" v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[76:79], %[v_os_b0], s[12:15], 0 offen offset:3072\n"
" v_add_f32 v54, v54, 1.0 \n"
" v_add_f32 v55, v55, 1.0 \n"
" v_add_f32 v56, v56, 1.0 \n"
" v_add_f32 v57, v57, 1.0 \n"
" v_rcp_f32 v54, v54 \n"
" v_rcp_f32 v55, v55 \n"
" v_rcp_f32 v56, v56 \n"
" v_rcp_f32 v57, v57 \n"
" v_mul_f32 v164, v164, v54 \n"
" v_mul_f32 v165, v165, v55 \n"
" v_mul_f32 v166, v166, v56 \n"
" v_mul_f32 v167, v167, v57 \n"
" buffer_load_dwordx4 acc[80:83], %[v_os_b1], s[12:15], 0 offen\n"
" v_mul_f32 v54, v168, v168 \n"
" v_mul_f32 v55, v169, v169 \n"
" v_mul_f32 v56, v170, v170 \n"
" v_mul_f32 v57, v171, v171 \n"
" v_fma_f32 v54, v54, s77, v1 \n"
" v_fma_f32 v55, v55, s77, v1 \n"
" v_fma_f32 v56, v56, s77, v1 \n"
" v_fma_f32 v57, v57, s77, v1 \n"
" v_mul_f32 v54, v54, v168 \n"
" v_mul_f32 v55, v55, v169 \n"
" v_mul_f32 v56, v56, v170 \n"
" v_mul_f32 v57, v57, v171 \n"
" v_mul_f32 v54, v54, s6 \n"
" v_mul_f32 v55, v55, s6 \n"
" v_mul_f32 v56, v56, s6 \n"
" v_mul_f32 v57, v57, s6 \n"
" v_exp_f32 v54, v54 \n"
" v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[84:87], %[v_os_b1], s[12:15], 0 offen offset:1024\n"
" v_add_f32 v54, v54, 1.0 \n"
" v_add_f32 v55, v55, 1.0 \n"
" v_add_f32 v56, v56, 1.0 \n"
" v_add_f32 v57, v57, 1.0 \n"
" v_rcp_f32 v54, v54 \n"
" v_rcp_f32 v55, v55 \n"
" v_rcp_f32 v56, v56 \n"
" v_rcp_f32 v57, v57 \n"
" v_mul_f32 v168, v168, v54 \n"
" v_mul_f32 v169, v169, v55 \n"
" v_mul_f32 v170, v170, v56 \n"
" v_mul_f32 v171, v171, v57 \n"
" buffer_load_dwordx4 acc[88:91], %[v_os_b1], s[12:15], 0 offen offset:2048\n"
" v_mul_f32 v54, v172, v172 \n"
" v_mul_f32 v55, v173, v173 \n"
" v_mul_f32 v56, v174, v174 \n"
" v_mul_f32 v57, v175, v175 \n"
" v_fma_f32 v54, v54, s77, v1 \n"
" v_fma_f32 v55, v55, s77, v1 \n"
" v_fma_f32 v56, v56, s77, v1 \n"
" v_fma_f32 v57, v57, s77, v1 \n"
" v_mul_f32 v54, v54, v172 \n"
" v_mul_f32 v55, v55, v173 \n"
" v_mul_f32 v56, v56, v174 \n"
" v_mul_f32 v57, v57, v175 \n"
" v_mul_f32 v54, v54, s6 \n"
" v_mul_f32 v55, v55, s6 \n"
" v_mul_f32 v56, v56, s6 \n"
" v_mul_f32 v57, v57, s6 \n"
" v_exp_f32 v54, v54 \n"
" v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[92:95], %[v_os_b1], s[12:15], 0 offen offset:3072\n"
" v_add_f32 v54, v54, 1.0 \n"
" v_add_f32 v55, v55, 1.0 \n"
" v_add_f32 v56, v56, 1.0 \n"
" v_add_f32 v57, v57, 1.0 \n"
" v_rcp_f32 v54, v54 \n"
" v_rcp_f32 v55, v55 \n"
" v_rcp_f32 v56, v56 \n"
" v_rcp_f32 v57, v57 \n"
" v_mul_f32 v172, v172, v54 \n"
" v_mul_f32 v173, v173, v55 \n"
" v_mul_f32 v174, v174, v56 \n"
" v_mul_f32 v175, v175, v57 \n"
" s_waitcnt vmcnt(24) \n"
" buffer_load_dwordx4 acc[96:99], %[v_os_b2], s[12:15], 0 offen\n"
" v_mul_f32 v54, v176, v176 \n"
" v_mul_f32 v55, v177, v177 \n"
" v_mul_f32 v56, v178, v178 \n"
" v_mul_f32 v57, v179, v179 \n"
" v_fma_f32 v54, v54, s77, v1 \n"
" v_fma_f32 v55, v55, s77, v1 \n"
" v_fma_f32 v56, v56, s77, v1 \n"
" v_fma_f32 v57, v57, s77, v1 \n"
" v_mul_f32 v54, v54, v176 \n"
" v_mul_f32 v55, v55, v177 \n"
" v_mul_f32 v56, v56, v178 \n"
" v_mul_f32 v57, v57, v179 \n"
" v_mul_f32 v54, v54, s6 \n"
" v_mul_f32 v55, v55, s6 \n"
" v_mul_f32 v56, v56, s6 \n"
" v_mul_f32 v57, v57, s6 \n"
" v_exp_f32 v54, v54 \n"
" v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[100:103], %[v_os_b2], s[12:15], 0 offen offset:1024\n"
" v_add_f32 v54, v54, 1.0 \n"
" v_add_f32 v55, v55, 1.0 \n"
" v_add_f32 v56, v56, 1.0 \n"
" v_add_f32 v57, v57, 1.0 \n"
" v_rcp_f32 v54, v54 \n"
" v_rcp_f32 v55, v55 \n"
" v_rcp_f32 v56, v56 \n"
" v_rcp_f32 v57, v57 \n"
" v_mul_f32 v176, v176, v54 \n"
" v_mul_f32 v177, v177, v55 \n"
" v_mul_f32 v178, v178, v56 \n"
" v_mul_f32 v179, v179, v57 \n"
" buffer_load_dwordx4 acc[104:107], %[v_os_b2], s[12:15], 0 offen offset:2048\n"
" v_mul_f32 v54, v180, v180 \n"
" v_mul_f32 v55, v181, v181 \n"
" v_mul_f32 v56, v182, v182 \n"
" v_mul_f32 v57, v183, v183 \n"
" v_fma_f32 v54, v54, s77, v1 \n"
" v_fma_f32 v55, v55, s77, v1 \n"
" v_fma_f32 v56, v56, s77, v1 \n"
" v_fma_f32 v57, v57, s77, v1 \n"
" v_mul_f32 v54, v54, v180 \n"
" v_mul_f32 v55, v55, v181 \n"
" v_mul_f32 v56, v56, v182 \n"
" v_mul_f32 v57, v57, v183 \n"
" v_mul_f32 v54, v54, s6 \n"
" v_mul_f32 v55, v55, s6 \n"
" v_mul_f32 v56, v56, s6 \n"
" v_mul_f32 v57, v57, s6 \n"
" v_exp_f32 v54, v54 \n"
" v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[108:111], %[v_os_b2], s[12:15], 0 offen offset:3072\n"
" v_add_f32 v54, v54, 1.0 \n"
" v_add_f32 v55, v55, 1.0 \n"
" v_add_f32 v56, v56, 1.0 \n"
" v_add_f32 v57, v57, 1.0 \n"
" v_rcp_f32 v54, v54 \n"
" v_rcp_f32 v55, v55 \n"
" v_rcp_f32 v56, v56 \n"
" v_rcp_f32 v57, v57 \n"
" v_mul_f32 v180, v180, v54 \n"
" v_mul_f32 v181, v181, v55 \n"
" v_mul_f32 v182, v182, v56 \n"
" v_mul_f32 v183, v183, v57 \n"
" buffer_load_dwordx4 acc[112:115], %[v_os_b3], s[12:15], 0 offen\n"
" v_mul_f32 v54, v184, v184 \n"
" v_mul_f32 v55, v185, v185 \n"
" v_mul_f32 v56, v186, v186 \n"
" v_mul_f32 v57, v187, v187 \n"
" v_fma_f32 v54, v54, s77, v1 \n"
" v_fma_f32 v55, v55, s77, v1 \n"
" v_fma_f32 v56, v56, s77, v1 \n"
" v_fma_f32 v57, v57, s77, v1 \n"
" v_mul_f32 v54, v54, v184 \n"
" v_mul_f32 v55, v55, v185 \n"
" v_mul_f32 v56, v56, v186 \n"
" v_mul_f32 v57, v57, v187 \n"
" v_mul_f32 v54, v54, s6 \n"
" v_mul_f32 v55, v55, s6 \n"
" v_mul_f32 v56, v56, s6 \n"
" v_mul_f32 v57, v57, s6 \n"
" v_exp_f32 v54, v54 \n"
" v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[116:119], %[v_os_b3], s[12:15], 0 offen offset:1024\n"
" v_add_f32 v54, v54, 1.0 \n"
" v_add_f32 v55, v55, 1.0 \n"
" v_add_f32 v56, v56, 1.0 \n"
" v_add_f32 v57, v57, 1.0 \n"
" v_rcp_f32 v54, v54 \n"
" v_rcp_f32 v55, v55 \n"
" v_rcp_f32 v56, v56 \n"
" v_rcp_f32 v57, v57 \n"
" v_mul_f32 v184, v184, v54 \n"
" v_mul_f32 v185, v185, v55 \n"
" v_mul_f32 v186, v186, v56 \n"
" v_mul_f32 v187, v187, v57 \n"
" buffer_load_dwordx4 acc[120:123], %[v_os_b3], s[12:15], 0 offen offset:2048\n"
" v_mul_f32 v54, v188, v188 \n"
" v_mul_f32 v55, v189, v189 \n"
" v_mul_f32 v56, v190, v190 \n"
" v_mul_f32 v57, v191, v191 \n"
" v_fma_f32 v54, v54, s77, v1 \n"
" v_fma_f32 v55, v55, s77, v1 \n"
" v_fma_f32 v56, v56, s77, v1 \n"
" v_fma_f32 v57, v57, s77, v1 \n"
" v_mul_f32 v54, v54, v188 \n"
" v_mul_f32 v55, v55, v189 \n"
" v_mul_f32 v56, v56, v190 \n"
" v_mul_f32 v57, v57, v191 \n"
" v_mul_f32 v54, v54, s6 \n"
" v_mul_f32 v55, v55, s6 \n"
" v_mul_f32 v56, v56, s6 \n"
" v_mul_f32 v57, v57, s6 \n"
" v_exp_f32 v54, v54 \n"
" v_exp_f32 v55, v55 \n"
" v_exp_f32 v56, v56 \n"
" v_exp_f32 v57, v57 \n"
" buffer_load_dwordx4 acc[124:127], %[v_os_b3], s[12:15], 0 offen offset:3072\n"
" v_add_f32 v54, v54, 1.0 \n"
" v_add_f32 v55, v55, 1.0 \n"
" v_add_f32 v56, v56, 1.0 \n"
" v_add_f32 v57, v57, 1.0 \n"
" v_rcp_f32 v54, v54 \n"
" v_rcp_f32 v55, v55 \n"
" v_rcp_f32 v56, v56 \n"
" v_rcp_f32 v57, v57 \n"
" v_mul_f32 v188, v188, v54 \n"
" v_mul_f32 v189, v189, v55 \n"
" v_mul_f32 v190, v190, v56 \n"
" v_mul_f32 v191, v191, v57 \n"
" v_mul_f32 v128, v18, v128 row_newbcast:0 \n"
" v_mul_f32 v129, v18, v129 row_newbcast:1 \n"
" v_mul_f32 v130, v18, v130 row_newbcast:2 \n"
" v_mul_f32 v131, v18, v131 row_newbcast:3 \n"
" v_mul_f32 v132, v18, v132 row_newbcast:0 \n"
" v_mul_f32 v133, v18, v133 row_newbcast:1 \n"
" v_mul_f32 v134, v18, v134 row_newbcast:2 \n"
" v_mul_f32 v135, v18, v135 row_newbcast:3 \n"
" v_mul_f32 v136, v18, v136 row_newbcast:4 \n"
" v_mul_f32 v137, v18, v137 row_newbcast:5 \n"
" v_mul_f32 v138, v18, v138 row_newbcast:6 \n"
" v_mul_f32 v139, v18, v139 row_newbcast:7 \n"
" v_mul_f32 v140, v18, v140 row_newbcast:4 \n"
" v_mul_f32 v141, v18, v141 row_newbcast:5 \n"
" v_mul_f32 v142, v18, v142 row_newbcast:6 \n"
" v_mul_f32 v143, v18, v143 row_newbcast:7 \n"
" v_mul_f32 v144, v18, v144 row_newbcast:8 \n"
" v_mul_f32 v145, v18, v145 row_newbcast:9 \n"
" v_mul_f32 v146, v18, v146 row_newbcast:10 \n"
" v_mul_f32 v147, v18, v147 row_newbcast:11 \n"
" v_mul_f32 v148, v18, v148 row_newbcast:8 \n"
" v_mul_f32 v149, v18, v149 row_newbcast:9 \n"
" v_mul_f32 v150, v18, v150 row_newbcast:10 \n"
" v_mul_f32 v151, v18, v151 row_newbcast:11 \n"
" v_mul_f32 v152, v18, v152 row_newbcast:12 \n"
" v_mul_f32 v153, v18, v153 row_newbcast:13 \n"
" v_mul_f32 v154, v18, v154 row_newbcast:14 \n"
" v_mul_f32 v155, v18, v155 row_newbcast:15 \n"
" v_mul_f32 v156, v18, v156 row_newbcast:12 \n"
" v_mul_f32 v157, v18, v157 row_newbcast:13 \n"
" v_mul_f32 v158, v18, v158 row_newbcast:14 \n"
" v_mul_f32 v159, v18, v159 row_newbcast:15 \n"
" v_mul_f32 v160, v19, v160 row_newbcast:0 \n"
" v_mul_f32 v161, v19, v161 row_newbcast:1 \n"
" v_mul_f32 v162, v19, v162 row_newbcast:2 \n"
" v_mul_f32 v163, v19, v163 row_newbcast:3 \n"
" v_mul_f32 v164, v19, v164 row_newbcast:0 \n"
" v_mul_f32 v165, v19, v165 row_newbcast:1 \n"
" v_mul_f32 v166, v19, v166 row_newbcast:2 \n"
" v_mul_f32 v167, v19, v167 row_newbcast:3 \n"
" v_mul_f32 v168, v19, v168 row_newbcast:4 \n"
" v_mul_f32 v169, v19, v169 row_newbcast:5 \n"
" v_mul_f32 v170, v19, v170 row_newbcast:6 \n"
" v_mul_f32 v171, v19, v171 row_newbcast:7 \n"
" v_mul_f32 v172, v19, v172 row_newbcast:4 \n"
" v_mul_f32 v173, v19, v173 row_newbcast:5 \n"
" v_mul_f32 v174, v19, v174 row_newbcast:6 \n"
" v_mul_f32 v175, v19, v175 row_newbcast:7 \n"
" v_mul_f32 v176, v19, v176 row_newbcast:8 \n"
" v_mul_f32 v177, v19, v177 row_newbcast:9 \n"
" v_mul_f32 v178, v19, v178 row_newbcast:10 \n"
" v_mul_f32 v179, v19, v179 row_newbcast:11 \n"
" v_mul_f32 v180, v19, v180 row_newbcast:8 \n"
" v_mul_f32 v181, v19, v181 row_newbcast:9 \n"
" v_mul_f32 v182, v19, v182 row_newbcast:10 \n"
" v_mul_f32 v183, v19, v183 row_newbcast:11 \n"
" v_mul_f32 v184, v19, v184 row_newbcast:12 \n"
" v_mul_f32 v185, v19, v185 row_newbcast:13 \n"
" v_mul_f32 v186, v19, v186 row_newbcast:14 \n"
" v_mul_f32 v187, v19, v187 row_newbcast:15 \n"
" v_mul_f32 v188, v19, v188 row_newbcast:12 \n"
" v_mul_f32 v189, v19, v189 row_newbcast:13 \n"
" v_mul_f32 v190, v19, v190 row_newbcast:14 \n"
" v_mul_f32 v191, v19, v191 row_newbcast:15 \n"
" buffer_load_dword v12, v5, s[16:19], 0 offen \n"
" v_mov_b32 v22, 0x358637bd \n"
" v_mov_b32 v23, 0x358637bd \n"
" v_max3_f32 v22, abs(v128), abs(v129), v22 \n"
" v_max3_f32 v22, abs(v130), abs(v131), v22 \n"
" v_max3_f32 v23, abs(v132), abs(v133), v23 \n"
" v_max3_f32 v23, abs(v134), abs(v135), v23 \n"
" v_max3_f32 v22, abs(v136), abs(v137), v22 \n"
" v_max3_f32 v22, abs(v138), abs(v139), v22 \n"
" v_max3_f32 v23, abs(v140), abs(v141), v23 \n"
" v_max3_f32 v23, abs(v142), abs(v143), v23 \n"
" v_max3_f32 v22, abs(v144), abs(v145), v22 \n"
" v_max3_f32 v22, abs(v146), abs(v147), v22 \n"
" v_max3_f32 v23, abs(v148), abs(v149), v23 \n"
" v_max3_f32 v23, abs(v150), abs(v151), v23 \n"
" v_max3_f32 v22, abs(v152), abs(v153), v22 \n"
" v_max3_f32 v22, abs(v154), abs(v155), v22 \n"
" v_max3_f32 v23, abs(v156), abs(v157), v23 \n"
" v_max3_f32 v23, abs(v158), abs(v159), v23 \n"
" v_max3_f32 v22, abs(v160), abs(v161), v22 \n"
" v_max3_f32 v22, abs(v162), abs(v163), v22 \n"
" v_max3_f32 v23, abs(v164), abs(v165), v23 \n"
" v_max3_f32 v23, abs(v166), abs(v167), v23 \n"
" v_max3_f32 v22, abs(v168), abs(v169), v22 \n"
" v_max3_f32 v22, abs(v170), abs(v171), v22 \n"
" v_max3_f32 v23, abs(v172), abs(v173), v23 \n"
" v_max3_f32 v23, abs(v174), abs(v175), v23 \n"
" v_max3_f32 v22, abs(v176), abs(v177), v22 \n"
" v_max3_f32 v22, abs(v178), abs(v179), v22 \n"
" v_max3_f32 v23, abs(v180), abs(v181), v23 \n"
" v_max3_f32 v23, abs(v182), abs(v183), v23 \n"
" v_max3_f32 v22, abs(v184), abs(v185), v22 \n"
" v_max3_f32 v22, abs(v186), abs(v187), v22 \n"
" v_max3_f32 v23, abs(v188), abs(v189), v23 \n"
" v_max3_f32 v23, abs(v190), abs(v191), v23 \n"
" v_lshlrev_b32 v54, 3, v0 \n"
" s_mul_i32 s60, 0x00000200, s7 \n"
" v_add_u32 v54, s60, v54 \n"
" ds_write_b64 v54, v[22:23] offset:16640 \n"
" s_waitcnt lgkmcnt(0) \n"
" s_barrier \n"
" v_and_b32 v54, 15, v0 \n"
" v_lshlrev_b32 v54, 3, v54 \n"
" ds_read_b64 v[96:97], v54 offset:16640 \n"
" ds_read_b64 v[98:99], v54 offset:16768 \n"
" ds_read_b64 v[100:101], v54 offset:16896 \n"
" ds_read_b64 v[102:103], v54 offset:17024 \n"
" ds_read_b64 v[104:105], v54 offset:17152 \n"
" ds_read_b64 v[106:107], v54 offset:17280 \n"
" ds_read_b64 v[108:109], v54 offset:17408 \n"
" ds_read_b64 v[110:111], v54 offset:17536 \n"
" ds_read_b64 v[112:113], v54 offset:17664 \n"
" ds_read_b64 v[114:115], v54 offset:17792 \n"
" ds_read_b64 v[116:117], v54 offset:17920 \n"
" ds_read_b64 v[118:119], v54 offset:18048 \n"
" ds_read_b64 v[120:121], v54 offset:18176 \n"
" ds_read_b64 v[122:123], v54 offset:18304 \n"
" ds_read_b64 v[124:125], v54 offset:18432 \n"
" ds_read_b64 v[126:127], v54 offset:18560 \n"
" s_waitcnt lgkmcnt(0) \n"
" v_max3_f32 v22, abs(v96), abs(v98), v22 \n"
" v_max3_f32 v23, abs(v97), abs(v99), v23 \n"
" v_max3_f32 v22, abs(v100), abs(v102), v22 \n"
" v_max3_f32 v23, abs(v101), abs(v103), v23 \n"
" v_max3_f32 v22, abs(v104), abs(v106), v22 \n"
" v_max3_f32 v23, abs(v105), abs(v107), v23 \n"
" v_max3_f32 v22, abs(v108), abs(v110), v22 \n"
" v_max3_f32 v23, abs(v109), abs(v111), v23 \n"
" v_max3_f32 v22, abs(v112), abs(v114), v22 \n"
" v_max3_f32 v23, abs(v113), abs(v115), v23 \n"
" v_max3_f32 v22, abs(v116), abs(v118), v22 \n"
" v_max3_f32 v23, abs(v117), abs(v119), v23 \n"
" v_max3_f32 v22, abs(v120), abs(v122), v22 \n"
" v_max3_f32 v23, abs(v121), abs(v123), v23 \n"
" v_max3_f32 v22, abs(v124), abs(v126), v22 \n"
" v_max3_f32 v23, abs(v125), abs(v127), v23 \n"
" v_rcp_f32 v22, v22 \n"
" v_rcp_f32 v23, v23 \n"
" v_mul_f32 v22, 0x42fe0000, v22 \n"
" v_mul_f32 v23, 0x42fe0000, v23 \n"
" v_mul_f32 v128, v22, v128 \n"
" v_mul_f32 v129, v22, v129 \n"
" v_mul_f32 v130, v22, v130 \n"
" v_mul_f32 v131, v22, v131 \n"
" v_cvt_i32_f32 v128, v128 \n"
" v_cvt_i32_f32 v129, v129 \n"
" v_cvt_i32_f32 v130, v130 \n"
" v_cvt_i32_f32 v131, v131 \n"
" v_perm_b32 v128, v129, v128, s53 \n"
" v_perm_b32 v128, v130, v128, s54 \n"
" v_perm_b32 v128, v131, v128, s55 \n"
" v_mul_f32 v132, v23, v132 \n"
" v_mul_f32 v133, v23, v133 \n"
" v_mul_f32 v134, v23, v134 \n"
" v_mul_f32 v135, v23, v135 \n"
" v_cvt_i32_f32 v132, v132 \n"
" v_cvt_i32_f32 v133, v133 \n"
" v_cvt_i32_f32 v134, v134 \n"
" v_cvt_i32_f32 v135, v135 \n"
" v_perm_b32 v129, v133, v132, s53 \n"
" v_perm_b32 v129, v134, v129, s54 \n"
" v_perm_b32 v129, v135, v129, s55 \n"
" v_mul_f32 v136, v22, v136 \n"
" v_mul_f32 v137, v22, v137 \n"
" v_mul_f32 v138, v22, v138 \n"
" v_mul_f32 v139, v22, v139 \n"
" v_cvt_i32_f32 v136, v136 \n"
" v_cvt_i32_f32 v137, v137 \n"
" v_cvt_i32_f32 v138, v138 \n"
" v_cvt_i32_f32 v139, v139 \n"
" v_perm_b32 v130, v137, v136, s53 \n"
" v_perm_b32 v130, v138, v130, s54 \n"
" v_perm_b32 v130, v139, v130, s55 \n"
" v_mul_f32 v140, v23, v140 \n"
" v_mul_f32 v141, v23, v141 \n"
" v_mul_f32 v142, v23, v142 \n"
" v_mul_f32 v143, v23, v143 \n"
" v_cvt_i32_f32 v140, v140 \n"
" v_cvt_i32_f32 v141, v141 \n"
" v_cvt_i32_f32 v142, v142 \n"
" v_cvt_i32_f32 v143, v143 \n"
" v_perm_b32 v131, v141, v140, s53 \n"
" v_perm_b32 v131, v142, v131, s54 \n"
" v_perm_b32 v131, v143, v131, s55 \n"
" v_mul_f32 v144, v22, v144 \n"
" v_mul_f32 v145, v22, v145 \n"
" v_mul_f32 v146, v22, v146 \n"
" v_mul_f32 v147, v22, v147 \n"
" v_cvt_i32_f32 v144, v144 \n"
" v_cvt_i32_f32 v145, v145 \n"
" v_cvt_i32_f32 v146, v146 \n"
" v_cvt_i32_f32 v147, v147 \n"
" v_perm_b32 v132, v145, v144, s53 \n"
" v_perm_b32 v132, v146, v132, s54 \n"
" v_perm_b32 v132, v147, v132, s55 \n"
" v_mul_f32 v148, v23, v148 \n"
" v_mul_f32 v149, v23, v149 \n"
" v_mul_f32 v150, v23, v150 \n"
" v_mul_f32 v151, v23, v151 \n"
" v_cvt_i32_f32 v148, v148 \n"
" v_cvt_i32_f32 v149, v149 \n"
" v_cvt_i32_f32 v150, v150 \n"
" v_cvt_i32_f32 v151, v151 \n"
" v_perm_b32 v133, v149, v148, s53 \n"
" v_perm_b32 v133, v150, v133, s54 \n"
" v_perm_b32 v133, v151, v133, s55 \n"
" v_mul_f32 v152, v22, v152 \n"
" v_mul_f32 v153, v22, v153 \n"
" v_mul_f32 v154, v22, v154 \n"
" v_mul_f32 v155, v22, v155 \n"
" v_cvt_i32_f32 v152, v152 \n"
" v_cvt_i32_f32 v153, v153 \n"
" v_cvt_i32_f32 v154, v154 \n"
" v_cvt_i32_f32 v155, v155 \n"
" v_perm_b32 v134, v153, v152, s53 \n"
" v_perm_b32 v134, v154, v134, s54 \n"
" v_perm_b32 v134, v155, v134, s55 \n"
" v_mul_f32 v156, v23, v156 \n"
" v_mul_f32 v157, v23, v157 \n"
" v_mul_f32 v158, v23, v158 \n"
" v_mul_f32 v159, v23, v159 \n"
" v_cvt_i32_f32 v156, v156 \n"
" v_cvt_i32_f32 v157, v157 \n"
" v_cvt_i32_f32 v158, v158 \n"
" v_cvt_i32_f32 v159, v159 \n"
" v_perm_b32 v135, v157, v156, s53 \n"
" v_perm_b32 v135, v158, v135, s54 \n"
" v_perm_b32 v135, v159, v135, s55 \n"
" v_mul_f32 v160, v22, v160 \n"
" v_mul_f32 v161, v22, v161 \n"
" v_mul_f32 v162, v22, v162 \n"
" v_mul_f32 v163, v22, v163 \n"
" v_cvt_i32_f32 v160, v160 \n"
" v_cvt_i32_f32 v161, v161 \n"
" v_cvt_i32_f32 v162, v162 \n"
" v_cvt_i32_f32 v163, v163 \n"
" v_perm_b32 v136, v161, v160, s53 \n"
" v_perm_b32 v136, v162, v136, s54 \n"
" v_perm_b32 v136, v163, v136, s55 \n"
" v_mul_f32 v164, v23, v164 \n"
" v_mul_f32 v165, v23, v165 \n"
" v_mul_f32 v166, v23, v166 \n"
" v_mul_f32 v167, v23, v167 \n"
" v_cvt_i32_f32 v164, v164 \n"
" v_cvt_i32_f32 v165, v165 \n"
" v_cvt_i32_f32 v166, v166 \n"
" v_cvt_i32_f32 v167, v167 \n"
" v_perm_b32 v137, v165, v164, s53 \n"
" v_perm_b32 v137, v166, v137, s54 \n"
" v_perm_b32 v137, v167, v137, s55 \n"
" v_mul_f32 v168, v22, v168 \n"
" v_mul_f32 v169, v22, v169 \n"
" v_mul_f32 v170, v22, v170 \n"
" v_mul_f32 v171, v22, v171 \n"
" v_cvt_i32_f32 v168, v168 \n"
" v_cvt_i32_f32 v169, v169 \n"
" v_cvt_i32_f32 v170, v170 \n"
" v_cvt_i32_f32 v171, v171 \n"
" v_perm_b32 v138, v169, v168, s53 \n"
" v_perm_b32 v138, v170, v138, s54 \n"
" v_perm_b32 v138, v171, v138, s55 \n"
" v_mul_f32 v172, v23, v172 \n"
" v_mul_f32 v173, v23, v173 \n"
" v_mul_f32 v174, v23, v174 \n"
" v_mul_f32 v175, v23, v175 \n"
" v_cvt_i32_f32 v172, v172 \n"
" v_cvt_i32_f32 v173, v173 \n"
" v_cvt_i32_f32 v174, v174 \n"
" v_cvt_i32_f32 v175, v175 \n"
" v_perm_b32 v139, v173, v172, s53 \n"
" v_perm_b32 v139, v174, v139, s54 \n"
" v_perm_b32 v139, v175, v139, s55 \n"
" v_mul_f32 v176, v22, v176 \n"
" v_mul_f32 v177, v22, v177 \n"
" v_mul_f32 v178, v22, v178 \n"
" v_mul_f32 v179, v22, v179 \n"
" v_cvt_i32_f32 v176, v176 \n"
" v_cvt_i32_f32 v177, v177 \n"
" v_cvt_i32_f32 v178, v178 \n"
" v_cvt_i32_f32 v179, v179 \n"
" v_perm_b32 v140, v177, v176, s53 \n"
" v_perm_b32 v140, v178, v140, s54 \n"
" v_perm_b32 v140, v179, v140, s55 \n"
" v_mul_f32 v180, v23, v180 \n"
" v_mul_f32 v181, v23, v181 \n"
" v_mul_f32 v182, v23, v182 \n"
" v_mul_f32 v183, v23, v183 \n"
" v_cvt_i32_f32 v180, v180 \n"
" v_cvt_i32_f32 v181, v181 \n"
" v_cvt_i32_f32 v182, v182 \n"
" v_cvt_i32_f32 v183, v183 \n"
" v_perm_b32 v141, v181, v180, s53 \n"
" v_perm_b32 v141, v182, v141, s54 \n"
" v_perm_b32 v141, v183, v141, s55 \n"
" v_mul_f32 v184, v22, v184 \n"
" v_mul_f32 v185, v22, v185 \n"
" v_mul_f32 v186, v22, v186 \n"
" v_mul_f32 v187, v22, v187 \n"
" v_cvt_i32_f32 v184, v184 \n"
" v_cvt_i32_f32 v185, v185 \n"
" v_cvt_i32_f32 v186, v186 \n"
" v_cvt_i32_f32 v187, v187 \n"
" v_perm_b32 v142, v185, v184, s53 \n"
" v_perm_b32 v142, v186, v142, s54 \n"
" v_perm_b32 v142, v187, v142, s55 \n"
" v_mul_f32 v188, v23, v188 \n"
" v_mul_f32 v189, v23, v189 \n"
" v_mul_f32 v190, v23, v190 \n"
" v_mul_f32 v191, v23, v191 \n"
" v_cvt_i32_f32 v188, v188 \n"
" v_cvt_i32_f32 v189, v189 \n"
" v_cvt_i32_f32 v190, v190 \n"
" v_cvt_i32_f32 v191, v191 \n"
" v_perm_b32 v143, v189, v188, s53 \n"
" v_perm_b32 v143, v190, v143, s54 \n"
" v_perm_b32 v143, v191, v143, s55 \n"
" v_rcp_f32 v24, v22 \n"
" v_rcp_f32 v25, v23 \n"
" v_lshrrev_b32 v54, 5, v0 \n"
" v_lshlrev_b32 v55, 5, v54 \n"
" v_and_b32 v54, 31, v0 \n"
" v_lshrrev_b32 v56, 4, v54 \n"
" v_add_u32 v55, v56, v55 \n"
" v_and_b32 v54, 15, v0 \n"
" v_lshlrev_b32 v54, 1, v54 \n"
" v_add_u32 v55, v54, v55 \n"
" v_lshlrev_b32 v54, 2, v55 \n"
" s_mul_i32 s60, 0x00000100, s7 \n"
" v_add_u32 v54, v54, s60 \n"
" ds_write_b32 v54, v128 offset:18688 \n"
" ds_write_b32 v54, v129 offset:26880 \n"
" ds_write_b32 v54, v130 offset:19712 \n"
" ds_write_b32 v54, v131 offset:27904 \n"
" ds_write_b32 v54, v132 offset:20736 \n"
" ds_write_b32 v54, v133 offset:28928 \n"
" ds_write_b32 v54, v134 offset:21760 \n"
" ds_write_b32 v54, v135 offset:29952 \n"
" ds_write_b32 v54, v136 offset:22784 \n"
" ds_write_b32 v54, v137 offset:30976 \n"
" ds_write_b32 v54, v138 offset:23808 \n"
" ds_write_b32 v54, v139 offset:32000 \n"
" ds_write_b32 v54, v140 offset:24832 \n"
" ds_write_b32 v54, v141 offset:33024 \n"
" ds_write_b32 v54, v142 offset:25856 \n"
" ds_write_b32 v54, v143 offset:34048 \n"
" s_waitcnt lgkmcnt(0) \n"
" s_barrier \n"
" v_lshrrev_b32 v54, 4, v0 \n"
" v_lshlrev_b32 v55, 6, v54 \n"
" v_and_b32 v54, 15, v0 \n"
" v_lshlrev_b32 v54, 1, v54 \n"
" v_add_u32 v55, v54, v55 \n"
" v_lshlrev_b32 v54, 2, v55 \n"
#undef _UK_MFMA_
#undef _UK_PK_CVT_
#undef _UK_ATOMIC_ADD_
#ifndef CK_TILE_FLATMM_UK_MFMA
#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_INT8
#endif
#if CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_INT8
# define _UK_MFMA_ "v_mfma_i32_16x16x32_i8"
# define _UK_PK_CVT_(x0_, x1_, y_) \
" v_cmp_u_f32 s[36:37], " x0_ ", " x0_ " \n" \
" v_add3_u32 v50, " x0_ ", %[v_nan_lo], 1 \n" \
" v_cndmask_b32 v54, v50, %[v_nan_hi], s[36:37] \n" \
" v_cmp_u_f32 s[36:37], " x1_ ", " x1_ " \n" \
" v_add3_u32 v50, " x1_ ", %[v_nan_lo], 1 \n" \
" v_cndmask_b32 v55, v50, %[v_nan_hi], s[36:37] \n" \
" v_perm_b32 " y_ ", v55, v54, s52 \n"
# define _UK_ATOMIC_ADD_ "global_atomic_pk_add_bf16"
#elif CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_FP16
# define _UK_MFMA_ "v_mfma_f32_16x16x16_f16"
# define _UK_PK_CVT_(x0_, x1_, y_) \
" v_cvt_f16_f32 v54, " x0_ " \n" \
" v_cvt_f16_f32 v55, " x1_ " \n" \
" v_pack_b32_f16 " y_ ", v54, v55 \n"
# define _UK_ATOMIC_ADD_ "global_atomic_pk_add_f16"
#endif
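// Read the quantized activations back from LDS into v[128..191]; together
// with the tiles streamed into acc[] they feed the int8 MFMAs of the second
// GEMM.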
" ds_read_b64 v[128:129], v54 offset:18688 \n"
" ds_read_b64 v[130:131], v54 offset:18816 \n"
" ds_read_b64 v[132:133], v54 offset:19712 \n"
" ds_read_b64 v[134:135], v54 offset:19840 \n"
" ds_read_b64 v[136:137], v54 offset:20736 \n"
" ds_read_b64 v[138:139], v54 offset:20864 \n"
" ds_read_b64 v[140:141], v54 offset:21760 \n"
" ds_read_b64 v[142:143], v54 offset:21888 \n"
" ds_read_b64 v[144:145], v54 offset:22784 \n"
" ds_read_b64 v[146:147], v54 offset:22912 \n"
" ds_read_b64 v[148:149], v54 offset:23808 \n"
" ds_read_b64 v[150:151], v54 offset:23936 \n"
" ds_read_b64 v[152:153], v54 offset:24832 \n"
" ds_read_b64 v[154:155], v54 offset:24960 \n"
" ds_read_b64 v[156:157], v54 offset:25856 \n"
" ds_read_b64 v[158:159], v54 offset:25984 \n"
" ds_read_b64 v[160:161], v54 offset:26880 \n"
" ds_read_b64 v[162:163], v54 offset:27008 \n"
" ds_read_b64 v[164:165], v54 offset:27904 \n"
" ds_read_b64 v[166:167], v54 offset:28032 \n"
" ds_read_b64 v[168:169], v54 offset:28928 \n"
" ds_read_b64 v[170:171], v54 offset:29056 \n"
" ds_read_b64 v[172:173], v54 offset:29952 \n"
" ds_read_b64 v[174:175], v54 offset:30080 \n"
" ds_read_b64 v[176:177], v54 offset:30976 \n"
" ds_read_b64 v[178:179], v54 offset:31104 \n"
" ds_read_b64 v[180:181], v54 offset:32000 \n"
" ds_read_b64 v[182:183], v54 offset:32128 \n"
" ds_read_b64 v[184:185], v54 offset:33024 \n"
" ds_read_b64 v[186:187], v54 offset:33152 \n"
" ds_read_b64 v[188:189], v54 offset:34048 \n"
" ds_read_b64 v[190:191], v54 offset:34176 \n"
" s_add_u32 s12, %[s_tile_os_b], s12 \n"
" s_addc_u32 s13, 0, s13 \n"
" s_add_u32 s16, %[s_tile_os_dq], s16 \n"
" s_addc_u32 s17, 0, s17 \n"
" s_mov_b32 s80, 0 \n"
" s_waitcnt 0x0000 \n"
"label_0C3C: \n"
" s_waitcnt vmcnt(41) \n"
" s_barrier \n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[0:1], v[128:129], 0\n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[2:3], v[130:131], v[192:195]\n"
" buffer_load_dwordx4 acc[128:131], %[v_os_b0], s[12:15], 0 offen\n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[4:5], v[132:133], v[192:195]\n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[6:7], v[134:135], v[192:195]\n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[8:9], v[136:137], v[192:195]\n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[10:11], v[138:139], v[192:195]\n"
" buffer_load_dwordx4 acc[132:135], %[v_os_b0], s[12:15], 0 offen offset:1024\n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[12:13], v[140:141], v[192:195]\n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[14:15], v[142:143], v[192:195]\n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[0:1], v[160:161], 0\n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[2:3], v[162:163], v[196:199]\n"
" buffer_load_dwordx4 acc[136:139], %[v_os_b0], s[12:15], 0 offen offset:2048\n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[4:5], v[164:165], v[196:199]\n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[6:7], v[166:167], v[196:199]\n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[8:9], v[168:169], v[196:199]\n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[10:11], v[170:171], v[196:199]\n"
" buffer_load_dwordx4 acc[140:143], %[v_os_b0], s[12:15], 0 offen offset:3072\n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[12:13], v[172:173], v[196:199]\n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[14:15], v[174:175], v[196:199]\n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[16:17], v[128:129], 0\n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[18:19], v[130:131], v[200:203]\n"
" buffer_load_dwordx4 acc[144:147], %[v_os_b1], s[12:15], 0 offen\n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[20:21], v[132:133], v[200:203]\n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[22:23], v[134:135], v[200:203]\n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[24:25], v[136:137], v[200:203]\n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[26:27], v[138:139], v[200:203]\n"
" buffer_load_dwordx4 acc[148:151], %[v_os_b1], s[12:15], 0 offen offset:1024\n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[28:29], v[140:141], v[200:203]\n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[30:31], v[142:143], v[200:203]\n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[16:17], v[160:161], 0\n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[18:19], v[162:163], v[204:207]\n"
" buffer_load_dwordx4 acc[152:155], %[v_os_b1], s[12:15], 0 offen offset:2048\n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[20:21], v[164:165], v[204:207]\n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[22:23], v[166:167], v[204:207]\n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[24:25], v[168:169], v[204:207]\n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[26:27], v[170:171], v[204:207]\n"
" buffer_load_dwordx4 acc[156:159], %[v_os_b1], s[12:15], 0 offen offset:3072\n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[28:29], v[172:173], v[204:207]\n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[30:31], v[174:175], v[204:207]\n"
" s_waitcnt vmcnt(41) \n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[32:33], v[128:129], 0\n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[34:35], v[130:131], v[208:211]\n"
" buffer_load_dwordx4 acc[160:163], %[v_os_b2], s[12:15], 0 offen\n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[36:37], v[132:133], v[208:211]\n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[38:39], v[134:135], v[208:211]\n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[40:41], v[136:137], v[208:211]\n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[42:43], v[138:139], v[208:211]\n"
" buffer_load_dwordx4 acc[164:167], %[v_os_b2], s[12:15], 0 offen offset:1024\n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[44:45], v[140:141], v[208:211]\n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[46:47], v[142:143], v[208:211]\n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[32:33], v[160:161], 0\n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[34:35], v[162:163], v[212:215]\n"
" buffer_load_dwordx4 acc[168:171], %[v_os_b2], s[12:15], 0 offen offset:2048\n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[36:37], v[164:165], v[212:215]\n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[38:39], v[166:167], v[212:215]\n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[40:41], v[168:169], v[212:215]\n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[42:43], v[170:171], v[212:215]\n"
" buffer_load_dwordx4 acc[172:175], %[v_os_b2], s[12:15], 0 offen offset:3072\n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[44:45], v[172:173], v[212:215]\n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[46:47], v[174:175], v[212:215]\n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[48:49], v[128:129], 0\n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[50:51], v[130:131], v[216:219]\n"
" buffer_load_dwordx4 acc[176:179], %[v_os_b3], s[12:15], 0 offen\n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[52:53], v[132:133], v[216:219]\n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[54:55], v[134:135], v[216:219]\n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[56:57], v[136:137], v[216:219]\n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[58:59], v[138:139], v[216:219]\n"
" buffer_load_dwordx4 acc[180:183], %[v_os_b3], s[12:15], 0 offen offset:1024\n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[60:61], v[140:141], v[216:219]\n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[62:63], v[142:143], v[216:219]\n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[48:49], v[160:161], 0\n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[50:51], v[162:163], v[220:223]\n"
" buffer_load_dwordx4 acc[184:187], %[v_os_b3], s[12:15], 0 offen offset:2048\n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[52:53], v[164:165], v[220:223]\n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[54:55], v[166:167], v[220:223]\n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[56:57], v[168:169], v[220:223]\n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[58:59], v[170:171], v[220:223]\n"
" buffer_load_dwordx4 acc[188:191], %[v_os_b3], s[12:15], 0 offen offset:3072\n"
" s_add_u32 s12, %[s_tile_os_b_half], s12 \n"
" s_addc_u32 s13, 0, s13 \n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[60:61], v[172:173], v[220:223]\n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[62:63], v[174:175], v[220:223]\n"
" s_waitcnt vmcnt(41) \n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[64:65], v[144:145], v[192:195]\n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[66:67], v[146:147], v[192:195]\n"
" buffer_load_dwordx4 acc[192:195], %[v_os_b0], s[12:15], 0 offen\n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[68:69], v[148:149], v[192:195]\n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[70:71], v[150:151], v[192:195]\n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[72:73], v[152:153], v[192:195]\n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[74:75], v[154:155], v[192:195]\n"
" buffer_load_dwordx4 acc[196:199], %[v_os_b0], s[12:15], 0 offen offset:1024\n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[76:77], v[156:157], v[192:195]\n"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[78:79], v[158:159], v[192:195]\n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[64:65], v[176:177], v[196:199]\n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[66:67], v[178:179], v[196:199]\n"
" buffer_load_dwordx4 acc[200:203], %[v_os_b0], s[12:15], 0 offen offset:2048\n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[68:69], v[180:181], v[196:199]\n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[70:71], v[182:183], v[196:199]\n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[72:73], v[184:185], v[196:199]\n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[74:75], v[186:187], v[196:199]\n"
" buffer_load_dwordx4 acc[204:207], %[v_os_b0], s[12:15], 0 offen offset:3072\n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[76:77], v[188:189], v[196:199]\n"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[78:79], v[190:191], v[196:199]\n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[80:81], v[144:145], v[200:203]\n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[82:83], v[146:147], v[200:203]\n"
" buffer_load_dwordx4 acc[208:211], %[v_os_b1], s[12:15], 0 offen\n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[84:85], v[148:149], v[200:203]\n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[86:87], v[150:151], v[200:203]\n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[88:89], v[152:153], v[200:203]\n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[90:91], v[154:155], v[200:203]\n"
" buffer_load_dwordx4 acc[212:215], %[v_os_b1], s[12:15], 0 offen offset:1024\n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[92:93], v[156:157], v[200:203]\n"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[94:95], v[158:159], v[200:203]\n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[80:81], v[176:177], v[204:207]\n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[82:83], v[178:179], v[204:207]\n"
" buffer_load_dwordx4 acc[216:219], %[v_os_b1], s[12:15], 0 offen offset:2048\n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[84:85], v[180:181], v[204:207]\n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[86:87], v[182:183], v[204:207]\n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[88:89], v[184:185], v[204:207]\n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[90:91], v[186:187], v[204:207]\n"
" buffer_load_dwordx4 acc[220:223], %[v_os_b1], s[12:15], 0 offen offset:3072\n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[92:93], v[188:189], v[204:207]\n"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[94:95], v[190:191], v[204:207]\n"
" s_waitcnt vmcnt(40) \n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[96:97], v[144:145], v[208:211]\n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[98:99], v[146:147], v[208:211]\n"
" buffer_load_dwordx4 acc[224:227], %[v_os_b2], s[12:15], 0 offen\n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[100:101], v[148:149], v[208:211]\n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[102:103], v[150:151], v[208:211]\n"
" buffer_load_dword v13, v5, s[16:19], 0 offen \n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[104:105], v[152:153], v[208:211]\n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[106:107], v[154:155], v[208:211]\n"
" buffer_load_dwordx4 acc[228:231], %[v_os_b2], s[12:15], 0 offen offset:1024\n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[108:109], v[156:157], v[208:211]\n"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[110:111], v[158:159], v[208:211]\n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[96:97], v[176:177], v[212:215]\n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[98:99], v[178:179], v[212:215]\n"
" buffer_load_dwordx4 acc[232:235], %[v_os_b2], s[12:15], 0 offen offset:2048\n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[100:101], v[180:181], v[212:215]\n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[102:103], v[182:183], v[212:215]\n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[104:105], v[184:185], v[212:215]\n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[106:107], v[186:187], v[212:215]\n"
" buffer_load_dwordx4 acc[236:239], %[v_os_b2], s[12:15], 0 offen offset:3072\n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[108:109], v[188:189], v[212:215]\n"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[110:111], v[190:191], v[212:215]\n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[112:113], v[144:145], v[216:219]\n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[114:115], v[146:147], v[216:219]\n"
" buffer_load_dwordx4 acc[240:243], %[v_os_b3], s[12:15], 0 offen\n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[116:117], v[148:149], v[216:219]\n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[118:119], v[150:151], v[216:219]\n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[120:121], v[152:153], v[216:219]\n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[122:123], v[154:155], v[216:219]\n"
" buffer_load_dwordx4 acc[244:247], %[v_os_b3], s[12:15], 0 offen offset:1024\n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[124:125], v[156:157], v[216:219]\n"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[126:127], v[158:159], v[216:219]\n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[112:113], v[176:177], v[220:223]\n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[114:115], v[178:179], v[220:223]\n"
" buffer_load_dwordx4 acc[248:251], %[v_os_b3], s[12:15], 0 offen offset:2048\n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[116:117], v[180:181], v[220:223]\n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[118:119], v[182:183], v[220:223]\n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[120:121], v[184:185], v[220:223]\n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[122:123], v[186:187], v[220:223]\n"
" buffer_load_dwordx4 acc[252:255], %[v_os_b3], s[12:15], 0 offen offset:3072\n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[124:125], v[188:189], v[220:223]\n"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[126:127], v[190:191], v[220:223]\n"
" s_add_u32 s60, 0x00000200, s80 \n"
" s_cmp_lt_u32 s60, s81 \n"
" s_cselect_b32 %[s_tile_os_b], %[s_tile_os_b], 0 \n"
" s_cselect_b32 %[s_tile_os_b_half], %[s_tile_os_b_half], 0 \n"
" s_cselect_b32 %[s_tile_os_dq], %[s_tile_os_dq], 0 \n"
" s_add_u32 s12, %[s_tile_os_b], s12 \n"
" s_addc_u32 s13, 0, s13 \n"
" s_add_u32 s16, %[s_tile_os_dq], s16 \n"
" s_addc_u32 s17, 0, s17 \n"
" v_cvt_f32_i32 v192, v192 \n"
" v_cvt_f32_i32 v193, v193 \n"
" v_cvt_f32_i32 v194, v194 \n"
" v_cvt_f32_i32 v195, v195 \n"
" v_mul_f32 v192, v24, v192 \n"
" v_mul_f32 v193, v24, v193 \n"
" v_mul_f32 v194, v24, v194 \n"
" v_mul_f32 v195, v24, v195 \n"
" v_mul_f32 v192, v12, v192 row_newbcast:0 \n"
" v_mul_f32 v193, v12, v193 row_newbcast:1 \n"
" v_mul_f32 v194, v12, v194 row_newbcast:2 \n"
" v_mul_f32 v195, v12, v195 row_newbcast:3 \n"
" v_mul_f32 v192, v20, v192 \n"
" v_mul_f32 v193, v20, v193 \n"
" v_mul_f32 v194, v20, v194 \n"
" v_mul_f32 v195, v20, v195 \n"
" v_cvt_f32_i32 v196, v196 \n"
" v_cvt_f32_i32 v197, v197 \n"
" v_cvt_f32_i32 v198, v198 \n"
" v_cvt_f32_i32 v199, v199 \n"
" v_mul_f32 v196, v25, v196 \n"
" v_mul_f32 v197, v25, v197 \n"
" v_mul_f32 v198, v25, v198 \n"
" v_mul_f32 v199, v25, v199 \n"
" v_mul_f32 v196, v12, v196 row_newbcast:0 \n"
" v_mul_f32 v197, v12, v197 row_newbcast:1 \n"
" v_mul_f32 v198, v12, v198 row_newbcast:2 \n"
" v_mul_f32 v199, v12, v199 row_newbcast:3 \n"
" v_mul_f32 v196, v21, v196 \n"
" v_mul_f32 v197, v21, v197 \n"
" v_mul_f32 v198, v21, v198 \n"
" v_mul_f32 v199, v21, v199 \n"
" v_cvt_f32_i32 v200, v200 \n"
" v_cvt_f32_i32 v201, v201 \n"
" v_cvt_f32_i32 v202, v202 \n"
" v_cvt_f32_i32 v203, v203 \n"
" v_mul_f32 v200, v24, v200 \n"
" v_mul_f32 v201, v24, v201 \n"
" v_mul_f32 v202, v24, v202 \n"
" v_mul_f32 v203, v24, v203 \n"
" v_mul_f32 v200, v12, v200 row_newbcast:4 \n"
" v_mul_f32 v201, v12, v201 row_newbcast:5 \n"
" v_mul_f32 v202, v12, v202 row_newbcast:6 \n"
" v_mul_f32 v203, v12, v203 row_newbcast:7 \n"
" v_mul_f32 v200, v20, v200 \n"
" v_mul_f32 v201, v20, v201 \n"
" v_mul_f32 v202, v20, v202 \n"
" v_mul_f32 v203, v20, v203 \n"
" v_cvt_f32_i32 v204, v204 \n"
" v_cvt_f32_i32 v205, v205 \n"
" v_cvt_f32_i32 v206, v206 \n"
" v_cvt_f32_i32 v207, v207 \n"
" v_mul_f32 v204, v25, v204 \n"
" v_mul_f32 v205, v25, v205 \n"
" v_mul_f32 v206, v25, v206 \n"
" v_mul_f32 v207, v25, v207 \n"
" v_mul_f32 v204, v12, v204 row_newbcast:4 \n"
" v_mul_f32 v205, v12, v205 row_newbcast:5 \n"
" v_mul_f32 v206, v12, v206 row_newbcast:6 \n"
" v_mul_f32 v207, v12, v207 row_newbcast:7 \n"
" v_mul_f32 v204, v21, v204 \n"
" v_mul_f32 v205, v21, v205 \n"
" v_mul_f32 v206, v21, v206 \n"
" v_mul_f32 v207, v21, v207 \n"
" v_cvt_f32_i32 v208, v208 \n"
" v_cvt_f32_i32 v209, v209 \n"
" v_cvt_f32_i32 v210, v210 \n"
" v_cvt_f32_i32 v211, v211 \n"
" v_mul_f32 v208, v24, v208 \n"
" v_mul_f32 v209, v24, v209 \n"
" v_mul_f32 v210, v24, v210 \n"
" v_mul_f32 v211, v24, v211 \n"
" v_mul_f32 v208, v12, v208 row_newbcast:8 \n"
" v_mul_f32 v209, v12, v209 row_newbcast:9 \n"
" v_mul_f32 v210, v12, v210 row_newbcast:10 \n"
" v_mul_f32 v211, v12, v211 row_newbcast:11 \n"
" v_mul_f32 v208, v20, v208 \n"
" v_mul_f32 v209, v20, v209 \n"
" v_mul_f32 v210, v20, v210 \n"
" v_mul_f32 v211, v20, v211 \n"
" v_cvt_f32_i32 v212, v212 \n"
" v_cvt_f32_i32 v213, v213 \n"
" v_cvt_f32_i32 v214, v214 \n"
" v_cvt_f32_i32 v215, v215 \n"
" v_mul_f32 v212, v25, v212 \n"
" v_mul_f32 v213, v25, v213 \n"
" v_mul_f32 v214, v25, v214 \n"
" v_mul_f32 v215, v25, v215 \n"
" v_mul_f32 v212, v12, v212 row_newbcast:8 \n"
" v_mul_f32 v213, v12, v213 row_newbcast:9 \n"
" v_mul_f32 v214, v12, v214 row_newbcast:10 \n"
" v_mul_f32 v215, v12, v215 row_newbcast:11 \n"
" v_mul_f32 v212, v21, v212 \n"
" v_mul_f32 v213, v21, v213 \n"
" v_mul_f32 v214, v21, v214 \n"
" v_mul_f32 v215, v21, v215 \n"
" v_cvt_f32_i32 v216, v216 \n"
" v_cvt_f32_i32 v217, v217 \n"
" v_cvt_f32_i32 v218, v218 \n"
" v_cvt_f32_i32 v219, v219 \n"
" v_mul_f32 v216, v24, v216 \n"
" v_mul_f32 v217, v24, v217 \n"
" v_mul_f32 v218, v24, v218 \n"
" v_mul_f32 v219, v24, v219 \n"
" v_mul_f32 v216, v12, v216 row_newbcast:12 \n"
" v_mul_f32 v217, v12, v217 row_newbcast:13 \n"
" v_mul_f32 v218, v12, v218 row_newbcast:14 \n"
" v_mul_f32 v219, v12, v219 row_newbcast:15 \n"
" v_mul_f32 v216, v20, v216 \n"
" v_mul_f32 v217, v20, v217 \n"
" v_mul_f32 v218, v20, v218 \n"
" v_mul_f32 v219, v20, v219 \n"
" v_cvt_f32_i32 v220, v220 \n"
" v_cvt_f32_i32 v221, v221 \n"
" v_cvt_f32_i32 v222, v222 \n"
" v_cvt_f32_i32 v223, v223 \n"
" v_mul_f32 v220, v25, v220 \n"
" v_mul_f32 v221, v25, v221 \n"
" v_mul_f32 v222, v25, v222 \n"
" v_mul_f32 v223, v25, v223 \n"
" v_mul_f32 v220, v12, v220 row_newbcast:12 \n"
" v_mul_f32 v221, v12, v221 row_newbcast:13 \n"
" v_mul_f32 v222, v12, v222 row_newbcast:14 \n"
" v_mul_f32 v223, v12, v223 row_newbcast:15 \n"
" v_mul_f32 v220, v21, v220 \n"
" v_mul_f32 v221, v21, v221 \n"
" v_mul_f32 v222, v21, v222 \n"
" v_mul_f32 v223, v21, v223 \n"
" v_cmp_u_f32 s[48:49], v192, v192 \n"
" v_add3_u32 v50, v192, v53, 1 \n"
" v_cndmask_b32 v54, v50, v52, s[48:49] \n"
" v_cmp_u_f32 s[48:49], v193, v193 \n"
" v_add3_u32 v50, v193, v53, 1 \n"
" v_cndmask_b32 v55, v50, v52, s[48:49] \n"
" v_perm_b32 v192, v55, v54, s52 \n"
" v_cmp_u_f32 s[48:49], v194, v194 \n"
" v_add3_u32 v50, v194, v53, 1 \n"
" v_cndmask_b32 v54, v50, v52, s[48:49] \n"
" v_cmp_u_f32 s[48:49], v195, v195 \n"
" v_add3_u32 v50, v195, v53, 1 \n"
" v_cndmask_b32 v55, v50, v52, s[48:49] \n"
" v_perm_b32 v193, v55, v54, s52 \n"
" v_cmp_u_f32 s[48:49], v196, v196 \n"
" v_add3_u32 v50, v196, v53, 1 \n"
" v_cndmask_b32 v54, v50, v52, s[48:49] \n"
" v_cmp_u_f32 s[48:49], v197, v197 \n"
" v_add3_u32 v50, v197, v53, 1 \n"
" v_cndmask_b32 v55, v50, v52, s[48:49] \n"
" v_perm_b32 v194, v55, v54, s52 \n"
" v_cmp_u_f32 s[48:49], v198, v198 \n"
" v_add3_u32 v50, v198, v53, 1 \n"
" v_cndmask_b32 v54, v50, v52, s[48:49] \n"
" v_cmp_u_f32 s[48:49], v199, v199 \n"
" v_add3_u32 v50, v199, v53, 1 \n"
" v_cndmask_b32 v55, v50, v52, s[48:49] \n"
" v_perm_b32 v195, v55, v54, s52 \n"
" v_cmp_u_f32 s[48:49], v200, v200 \n"
" v_add3_u32 v50, v200, v53, 1 \n"
" v_cndmask_b32 v54, v50, v52, s[48:49] \n"
" v_cmp_u_f32 s[48:49], v201, v201 \n"
" v_add3_u32 v50, v201, v53, 1 \n"
" v_cndmask_b32 v55, v50, v52, s[48:49] \n"
" v_perm_b32 v196, v55, v54, s52 \n"
" v_cmp_u_f32 s[48:49], v202, v202 \n"
" v_add3_u32 v50, v202, v53, 1 \n"
" v_cndmask_b32 v54, v50, v52, s[48:49] \n"
" v_cmp_u_f32 s[48:49], v203, v203 \n"
" v_add3_u32 v50, v203, v53, 1 \n"
" v_cndmask_b32 v55, v50, v52, s[48:49] \n"
" v_perm_b32 v197, v55, v54, s52 \n"
" v_cmp_u_f32 s[48:49], v204, v204 \n"
" v_add3_u32 v50, v204, v53, 1 \n"
" v_cndmask_b32 v54, v50, v52, s[48:49] \n"
" v_cmp_u_f32 s[48:49], v205, v205 \n"
" v_add3_u32 v50, v205, v53, 1 \n"
" v_cndmask_b32 v55, v50, v52, s[48:49] \n"
" v_perm_b32 v198, v55, v54, s52 \n"
" v_cmp_u_f32 s[48:49], v206, v206 \n"
" v_add3_u32 v50, v206, v53, 1 \n"
" v_cndmask_b32 v54, v50, v52, s[48:49] \n"
" v_cmp_u_f32 s[48:49], v207, v207 \n"
" v_add3_u32 v50, v207, v53, 1 \n"
" v_cndmask_b32 v55, v50, v52, s[48:49] \n"
" v_perm_b32 v199, v55, v54, s52 \n"
" v_cmp_u_f32 s[48:49], v208, v208 \n"
" v_add3_u32 v50, v208, v53, 1 \n"
" v_cndmask_b32 v54, v50, v52, s[48:49] \n"
" v_cmp_u_f32 s[48:49], v209, v209 \n"
" v_add3_u32 v50, v209, v53, 1 \n"
" v_cndmask_b32 v55, v50, v52, s[48:49] \n"
" v_perm_b32 v200, v55, v54, s52 \n"
" v_cmp_u_f32 s[48:49], v210, v210 \n"
" v_add3_u32 v50, v210, v53, 1 \n"
" v_cndmask_b32 v54, v50, v52, s[48:49] \n"
" v_cmp_u_f32 s[48:49], v211, v211 \n"
" v_add3_u32 v50, v211, v53, 1 \n"
" v_cndmask_b32 v55, v50, v52, s[48:49] \n"
" v_perm_b32 v201, v55, v54, s52 \n"
" v_cmp_u_f32 s[48:49], v212, v212 \n"
" v_add3_u32 v50, v212, v53, 1 \n"
" v_cndmask_b32 v54, v50, v52, s[48:49] \n"
" v_cmp_u_f32 s[48:49], v213, v213 \n"
" v_add3_u32 v50, v213, v53, 1 \n"
" v_cndmask_b32 v55, v50, v52, s[48:49] \n"
" v_perm_b32 v202, v55, v54, s52 \n"
" v_cmp_u_f32 s[48:49], v214, v214 \n"
" v_add3_u32 v50, v214, v53, 1 \n"
" v_cndmask_b32 v54, v50, v52, s[48:49] \n"
" v_cmp_u_f32 s[48:49], v215, v215 \n"
" v_add3_u32 v50, v215, v53, 1 \n"
" v_cndmask_b32 v55, v50, v52, s[48:49] \n"
" v_perm_b32 v203, v55, v54, s52 \n"
" v_cmp_u_f32 s[48:49], v216, v216 \n"
" v_add3_u32 v50, v216, v53, 1 \n"
" v_cndmask_b32 v54, v50, v52, s[48:49] \n"
" v_cmp_u_f32 s[48:49], v217, v217 \n"
" v_add3_u32 v50, v217, v53, 1 \n"
" v_cndmask_b32 v55, v50, v52, s[48:49] \n"
" v_perm_b32 v204, v55, v54, s52 \n"
" v_cmp_u_f32 s[48:49], v218, v218 \n"
" v_add3_u32 v50, v218, v53, 1 \n"
" v_cndmask_b32 v54, v50, v52, s[48:49] \n"
" v_cmp_u_f32 s[48:49], v219, v219 \n"
" v_add3_u32 v50, v219, v53, 1 \n"
" v_cndmask_b32 v55, v50, v52, s[48:49] \n"
" v_perm_b32 v205, v55, v54, s52 \n"
" v_cmp_u_f32 s[48:49], v220, v220 \n"
" v_add3_u32 v50, v220, v53, 1 \n"
" v_cndmask_b32 v54, v50, v52, s[48:49] \n"
" v_cmp_u_f32 s[48:49], v221, v221 \n"
" v_add3_u32 v50, v221, v53, 1 \n"
" v_cndmask_b32 v55, v50, v52, s[48:49] \n"
" v_perm_b32 v206, v55, v54, s52 \n"
" v_cmp_u_f32 s[48:49], v222, v222 \n"
" v_add3_u32 v50, v222, v53, 1 \n"
" v_cndmask_b32 v54, v50, v52, s[48:49] \n"
" v_cmp_u_f32 s[48:49], v223, v223 \n"
" v_add3_u32 v50, v223, v53, 1 \n"
" v_cndmask_b32 v55, v50, v52, s[48:49] \n"
" v_perm_b32 v207, v55, v54, s52 \n"
" ds_write_b64 v3, v[192:193] offset:35072 \n"
" ds_write_b64 v3, v[194:195] offset:43776 \n"
" ds_write_b64 v3, v[196:197] offset:37248 \n"
" ds_write_b64 v3, v[198:199] offset:45952 \n"
" ds_write_b64 v3, v[200:201] offset:39424 \n"
" ds_write_b64 v3, v[202:203] offset:48128 \n"
" ds_write_b64 v3, v[204:205] offset:41600 \n"
" ds_write_b64 v3, v[206:207] offset:50304 \n"
" s_waitcnt lgkmcnt(0) \n"
" s_barrier \n"
" ds_read_b32 v64, v4 offset:35072 \n"
" ds_read_b32 v65, v4 offset:39424 \n"
" ds_read_b32 v66, v4 offset:35104 \n"
" ds_read_b32 v67, v4 offset:39456 \n"
" ds_read_b32 v68, v4 offset:35136 \n"
" ds_read_b32 v69, v4 offset:39488 \n"
" ds_read_b32 v70, v4 offset:35168 \n"
" ds_read_b32 v71, v4 offset:39520 \n"
" ds_read_b32 v72, v4 offset:43776 \n"
" ds_read_b32 v73, v4 offset:48128 \n"
" ds_read_b32 v74, v4 offset:43808 \n"
" ds_read_b32 v75, v4 offset:48160 \n"
" ds_read_b32 v76, v4 offset:43840 \n"
" ds_read_b32 v77, v4 offset:48192 \n"
" ds_read_b32 v78, v4 offset:43872 \n"
" ds_read_b32 v79, v4 offset:48224 \n"
" s_waitcnt lgkmcnt(0) \n"
" s_mov_b64 exec, s[20:21] \n"
" global_atomic_pk_add_bf16 v80, v64, s[8:9] \n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[20:21] \n"
" global_atomic_pk_add_bf16 v80, v65, s[8:9] inst_offset:256\n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[22:23] \n"
" global_atomic_pk_add_bf16 v82, v66, s[8:9] \n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[22:23] \n"
" global_atomic_pk_add_bf16 v82, v67, s[8:9] inst_offset:256\n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[24:25] \n"
" global_atomic_pk_add_bf16 v84, v68, s[8:9] \n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[24:25] \n"
" global_atomic_pk_add_bf16 v84, v69, s[8:9] inst_offset:256\n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[26:27] \n"
" global_atomic_pk_add_bf16 v86, v70, s[8:9] \n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[26:27] \n"
" global_atomic_pk_add_bf16 v86, v71, s[8:9] inst_offset:256\n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[28:29] \n"
" global_atomic_pk_add_bf16 v88, v72, s[8:9] \n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[28:29] \n"
" global_atomic_pk_add_bf16 v88, v73, s[8:9] inst_offset:256\n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[30:31] \n"
" global_atomic_pk_add_bf16 v90, v74, s[8:9] \n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[30:31] \n"
" global_atomic_pk_add_bf16 v90, v75, s[8:9] inst_offset:256\n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[32:33] \n"
" global_atomic_pk_add_bf16 v92, v76, s[8:9] \n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[32:33] \n"
" global_atomic_pk_add_bf16 v92, v77, s[8:9] inst_offset:256\n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[34:35] \n"
" global_atomic_pk_add_bf16 v94, v78, s[8:9] \n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[34:35] \n"
" global_atomic_pk_add_bf16 v94, v79, s[8:9] inst_offset:256\n"
" s_mov_b64 exec, s[36:37] \n"
" s_add_u32 s8, s59, s8 \n"
" s_addc_u32 s9, 0, s9 \n"
" s_addk_i32 s80, 0x0100 \n"
" s_cmp_lt_i32 s80, s81 \n"
" s_cbranch_scc0 label_2301 \n"
" s_waitcnt vmcnt(41) \n"
" s_barrier \n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[128:129], v[128:129], 0\n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[130:131], v[130:131], v[224:227]\n"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen\n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[132:133], v[132:133], v[224:227]\n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[134:135], v[134:135], v[224:227]\n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[136:137], v[136:137], v[224:227]\n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[138:139], v[138:139], v[224:227]\n"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024\n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[140:141], v[140:141], v[224:227]\n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[142:143], v[142:143], v[224:227]\n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[128:129], v[160:161], 0\n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[130:131], v[162:163], v[228:231]\n"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048\n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[132:133], v[164:165], v[228:231]\n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[134:135], v[166:167], v[228:231]\n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[136:137], v[168:169], v[228:231]\n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[138:139], v[170:171], v[228:231]\n"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072\n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[140:141], v[172:173], v[228:231]\n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[142:143], v[174:175], v[228:231]\n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[144:145], v[128:129], 0\n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[146:147], v[130:131], v[232:235]\n"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen\n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[148:149], v[132:133], v[232:235]\n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[150:151], v[134:135], v[232:235]\n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[152:153], v[136:137], v[232:235]\n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[154:155], v[138:139], v[232:235]\n"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024\n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[156:157], v[140:141], v[232:235]\n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[158:159], v[142:143], v[232:235]\n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[144:145], v[160:161], 0\n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[146:147], v[162:163], v[236:239]\n"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048\n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[148:149], v[164:165], v[236:239]\n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[150:151], v[166:167], v[236:239]\n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[152:153], v[168:169], v[236:239]\n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[154:155], v[170:171], v[236:239]\n"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072\n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[156:157], v[172:173], v[236:239]\n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[158:159], v[174:175], v[236:239]\n"
" s_waitcnt vmcnt(41) \n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[160:161], v[128:129], 0\n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[162:163], v[130:131], v[240:243]\n"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen\n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[164:165], v[132:133], v[240:243]\n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[166:167], v[134:135], v[240:243]\n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[168:169], v[136:137], v[240:243]\n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[170:171], v[138:139], v[240:243]\n"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024\n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[172:173], v[140:141], v[240:243]\n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[174:175], v[142:143], v[240:243]\n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[160:161], v[160:161], 0\n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[162:163], v[162:163], v[244:247]\n"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048\n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[164:165], v[164:165], v[244:247]\n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[166:167], v[166:167], v[244:247]\n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[168:169], v[168:169], v[244:247]\n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[170:171], v[170:171], v[244:247]\n"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072\n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[172:173], v[172:173], v[244:247]\n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[174:175], v[174:175], v[244:247]\n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[176:177], v[128:129], 0\n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[178:179], v[130:131], v[248:251]\n"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen\n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[180:181], v[132:133], v[248:251]\n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[182:183], v[134:135], v[248:251]\n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[184:185], v[136:137], v[248:251]\n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[186:187], v[138:139], v[248:251]\n"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024\n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[188:189], v[140:141], v[248:251]\n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[190:191], v[142:143], v[248:251]\n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[176:177], v[160:161], 0\n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[178:179], v[162:163], v[252:255]\n"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048\n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[180:181], v[164:165], v[252:255]\n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[182:183], v[166:167], v[252:255]\n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[184:185], v[168:169], v[252:255]\n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[186:187], v[170:171], v[252:255]\n"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072\n"
" s_add_u32 s12, %[s_tile_os_b_half], s12 \n"
" s_addc_u32 s13, 0, s13 \n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[188:189], v[172:173], v[252:255]\n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[190:191], v[174:175], v[252:255]\n"
" s_waitcnt vmcnt(41) \n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[192:193], v[144:145], v[224:227]\n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[194:195], v[146:147], v[224:227]\n"
" buffer_load_dwordx4 acc[64:67], %[v_os_b0], s[12:15], 0 offen\n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[196:197], v[148:149], v[224:227]\n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[198:199], v[150:151], v[224:227]\n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[200:201], v[152:153], v[224:227]\n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[202:203], v[154:155], v[224:227]\n"
" buffer_load_dwordx4 acc[68:71], %[v_os_b0], s[12:15], 0 offen offset:1024\n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[204:205], v[156:157], v[224:227]\n"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[206:207], v[158:159], v[224:227]\n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[192:193], v[176:177], v[228:231]\n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[194:195], v[178:179], v[228:231]\n"
" buffer_load_dwordx4 acc[72:75], %[v_os_b0], s[12:15], 0 offen offset:2048\n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[196:197], v[180:181], v[228:231]\n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[198:199], v[182:183], v[228:231]\n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[200:201], v[184:185], v[228:231]\n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[202:203], v[186:187], v[228:231]\n"
" buffer_load_dwordx4 acc[76:79], %[v_os_b0], s[12:15], 0 offen offset:3072\n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[204:205], v[188:189], v[228:231]\n"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[206:207], v[190:191], v[228:231]\n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[208:209], v[144:145], v[232:235]\n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[210:211], v[146:147], v[232:235]\n"
" buffer_load_dwordx4 acc[80:83], %[v_os_b1], s[12:15], 0 offen\n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[212:213], v[148:149], v[232:235]\n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[214:215], v[150:151], v[232:235]\n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[216:217], v[152:153], v[232:235]\n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[218:219], v[154:155], v[232:235]\n"
" buffer_load_dwordx4 acc[84:87], %[v_os_b1], s[12:15], 0 offen offset:1024\n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[220:221], v[156:157], v[232:235]\n"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[222:223], v[158:159], v[232:235]\n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[208:209], v[176:177], v[236:239]\n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[210:211], v[178:179], v[236:239]\n"
" buffer_load_dwordx4 acc[88:91], %[v_os_b1], s[12:15], 0 offen offset:2048\n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[212:213], v[180:181], v[236:239]\n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[214:215], v[182:183], v[236:239]\n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[216:217], v[184:185], v[236:239]\n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[218:219], v[186:187], v[236:239]\n"
" buffer_load_dwordx4 acc[92:95], %[v_os_b1], s[12:15], 0 offen offset:3072\n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[220:221], v[188:189], v[236:239]\n"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[222:223], v[190:191], v[236:239]\n"
" s_waitcnt vmcnt(40) \n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[224:225], v[144:145], v[240:243]\n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[226:227], v[146:147], v[240:243]\n"
" buffer_load_dwordx4 acc[96:99], %[v_os_b2], s[12:15], 0 offen\n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[228:229], v[148:149], v[240:243]\n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[230:231], v[150:151], v[240:243]\n"
" buffer_load_dword v12, v5, s[16:19], 0 offen \n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[232:233], v[152:153], v[240:243]\n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[234:235], v[154:155], v[240:243]\n"
" buffer_load_dwordx4 acc[100:103], %[v_os_b2], s[12:15], 0 offen offset:1024\n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[236:237], v[156:157], v[240:243]\n"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[238:239], v[158:159], v[240:243]\n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[224:225], v[176:177], v[244:247]\n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[226:227], v[178:179], v[244:247]\n"
" buffer_load_dwordx4 acc[104:107], %[v_os_b2], s[12:15], 0 offen offset:2048\n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[228:229], v[180:181], v[244:247]\n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[230:231], v[182:183], v[244:247]\n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[232:233], v[184:185], v[244:247]\n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[234:235], v[186:187], v[244:247]\n"
" buffer_load_dwordx4 acc[108:111], %[v_os_b2], s[12:15], 0 offen offset:3072\n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[236:237], v[188:189], v[244:247]\n"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[238:239], v[190:191], v[244:247]\n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[240:241], v[144:145], v[248:251]\n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[242:243], v[146:147], v[248:251]\n"
" buffer_load_dwordx4 acc[112:115], %[v_os_b3], s[12:15], 0 offen\n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[244:245], v[148:149], v[248:251]\n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[246:247], v[150:151], v[248:251]\n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[248:249], v[152:153], v[248:251]\n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[250:251], v[154:155], v[248:251]\n"
" buffer_load_dwordx4 acc[116:119], %[v_os_b3], s[12:15], 0 offen offset:1024\n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[252:253], v[156:157], v[248:251]\n"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[254:255], v[158:159], v[248:251]\n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[240:241], v[176:177], v[252:255]\n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[242:243], v[178:179], v[252:255]\n"
" buffer_load_dwordx4 acc[120:123], %[v_os_b3], s[12:15], 0 offen offset:2048\n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[244:245], v[180:181], v[252:255]\n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[246:247], v[182:183], v[252:255]\n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[248:249], v[184:185], v[252:255]\n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[250:251], v[186:187], v[252:255]\n"
" buffer_load_dwordx4 acc[124:127], %[v_os_b3], s[12:15], 0 offen offset:3072\n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[252:253], v[188:189], v[252:255]\n"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[254:255], v[190:191], v[252:255]\n"
" s_add_u32 s60, 0x00000200, s80 \n"
" s_cmp_lt_u32 s60, s81 \n"
" s_cselect_b32 %[s_tile_os_b], %[s_tile_os_b], 0 \n"
" s_cselect_b32 %[s_tile_os_b_half], %[s_tile_os_b_half], 0 \n"
" s_cselect_b32 %[s_tile_os_dq], %[s_tile_os_dq], 0 \n"
" s_add_u32 s12, %[s_tile_os_b], s12 \n"
" s_addc_u32 s13, 0, s13 \n"
" s_add_u32 s16, %[s_tile_os_dq], s16 \n"
" s_addc_u32 s17, 0, s17 \n"
" v_cvt_f32_i32 v224, v224 \n"
" v_cvt_f32_i32 v225, v225 \n"
" v_cvt_f32_i32 v226, v226 \n"
" v_cvt_f32_i32 v227, v227 \n"
" v_mul_f32 v224, v24, v224 \n"
" v_mul_f32 v225, v24, v225 \n"
" v_mul_f32 v226, v24, v226 \n"
" v_mul_f32 v227, v24, v227 \n"
" v_mul_f32 v224, v13, v224 row_newbcast:0 \n"
" v_mul_f32 v225, v13, v225 row_newbcast:1 \n"
" v_mul_f32 v226, v13, v226 row_newbcast:2 \n"
" v_mul_f32 v227, v13, v227 row_newbcast:3 \n"
" v_mul_f32 v224, v20, v224 \n"
" v_mul_f32 v225, v20, v225 \n"
" v_mul_f32 v226, v20, v226 \n"
" v_mul_f32 v227, v20, v227 \n"
" v_cvt_f32_i32 v228, v228 \n"
" v_cvt_f32_i32 v229, v229 \n"
" v_cvt_f32_i32 v230, v230 \n"
" v_cvt_f32_i32 v231, v231 \n"
" v_mul_f32 v228, v25, v228 \n"
" v_mul_f32 v229, v25, v229 \n"
" v_mul_f32 v230, v25, v230 \n"
" v_mul_f32 v231, v25, v231 \n"
" v_mul_f32 v228, v13, v228 row_newbcast:0 \n"
" v_mul_f32 v229, v13, v229 row_newbcast:1 \n"
" v_mul_f32 v230, v13, v230 row_newbcast:2 \n"
" v_mul_f32 v231, v13, v231 row_newbcast:3 \n"
" v_mul_f32 v228, v21, v228 \n"
" v_mul_f32 v229, v21, v229 \n"
" v_mul_f32 v230, v21, v230 \n"
" v_mul_f32 v231, v21, v231 \n"
" v_cvt_f32_i32 v232, v232 \n"
" v_cvt_f32_i32 v233, v233 \n"
" v_cvt_f32_i32 v234, v234 \n"
" v_cvt_f32_i32 v235, v235 \n"
" v_mul_f32 v232, v24, v232 \n"
" v_mul_f32 v233, v24, v233 \n"
" v_mul_f32 v234, v24, v234 \n"
" v_mul_f32 v235, v24, v235 \n"
" v_mul_f32 v232, v13, v232 row_newbcast:4 \n"
" v_mul_f32 v233, v13, v233 row_newbcast:5 \n"
" v_mul_f32 v234, v13, v234 row_newbcast:6 \n"
" v_mul_f32 v235, v13, v235 row_newbcast:7 \n"
" v_mul_f32 v232, v20, v232 \n"
" v_mul_f32 v233, v20, v233 \n"
" v_mul_f32 v234, v20, v234 \n"
" v_mul_f32 v235, v20, v235 \n"
" v_cvt_f32_i32 v236, v236 \n"
" v_cvt_f32_i32 v237, v237 \n"
" v_cvt_f32_i32 v238, v238 \n"
" v_cvt_f32_i32 v239, v239 \n"
" v_mul_f32 v236, v25, v236 \n"
" v_mul_f32 v237, v25, v237 \n"
" v_mul_f32 v238, v25, v238 \n"
" v_mul_f32 v239, v25, v239 \n"
" v_mul_f32 v236, v13, v236 row_newbcast:4 \n"
" v_mul_f32 v237, v13, v237 row_newbcast:5 \n"
" v_mul_f32 v238, v13, v238 row_newbcast:6 \n"
" v_mul_f32 v239, v13, v239 row_newbcast:7 \n"
" v_mul_f32 v236, v21, v236 \n"
" v_mul_f32 v237, v21, v237 \n"
" v_mul_f32 v238, v21, v238 \n"
" v_mul_f32 v239, v21, v239 \n"
" v_cvt_f32_i32 v240, v240 \n"
" v_cvt_f32_i32 v241, v241 \n"
" v_cvt_f32_i32 v242, v242 \n"
" v_cvt_f32_i32 v243, v243 \n"
" v_mul_f32 v240, v24, v240 \n"
" v_mul_f32 v241, v24, v241 \n"
" v_mul_f32 v242, v24, v242 \n"
" v_mul_f32 v243, v24, v243 \n"
" v_mul_f32 v240, v13, v240 row_newbcast:8 \n"
" v_mul_f32 v241, v13, v241 row_newbcast:9 \n"
" v_mul_f32 v242, v13, v242 row_newbcast:10 \n"
" v_mul_f32 v243, v13, v243 row_newbcast:11 \n"
" v_mul_f32 v240, v20, v240 \n"
" v_mul_f32 v241, v20, v241 \n"
" v_mul_f32 v242, v20, v242 \n"
" v_mul_f32 v243, v20, v243 \n"
" v_cvt_f32_i32 v244, v244 \n"
" v_cvt_f32_i32 v245, v245 \n"
" v_cvt_f32_i32 v246, v246 \n"
" v_cvt_f32_i32 v247, v247 \n"
" v_mul_f32 v244, v25, v244 \n"
" v_mul_f32 v245, v25, v245 \n"
" v_mul_f32 v246, v25, v246 \n"
" v_mul_f32 v247, v25, v247 \n"
" v_mul_f32 v244, v13, v244 row_newbcast:8 \n"
" v_mul_f32 v245, v13, v245 row_newbcast:9 \n"
" v_mul_f32 v246, v13, v246 row_newbcast:10 \n"
" v_mul_f32 v247, v13, v247 row_newbcast:11 \n"
" v_mul_f32 v244, v21, v244 \n"
" v_mul_f32 v245, v21, v245 \n"
" v_mul_f32 v246, v21, v246 \n"
" v_mul_f32 v247, v21, v247 \n"
" v_cvt_f32_i32 v248, v248 \n"
" v_cvt_f32_i32 v249, v249 \n"
" v_cvt_f32_i32 v250, v250 \n"
" v_cvt_f32_i32 v251, v251 \n"
" v_mul_f32 v248, v24, v248 \n"
" v_mul_f32 v249, v24, v249 \n"
" v_mul_f32 v250, v24, v250 \n"
" v_mul_f32 v251, v24, v251 \n"
" v_mul_f32 v248, v13, v248 row_newbcast:12 \n"
" v_mul_f32 v249, v13, v249 row_newbcast:13 \n"
" v_mul_f32 v250, v13, v250 row_newbcast:14 \n"
" v_mul_f32 v251, v13, v251 row_newbcast:15 \n"
" v_mul_f32 v248, v20, v248 \n"
" v_mul_f32 v249, v20, v249 \n"
" v_mul_f32 v250, v20, v250 \n"
" v_mul_f32 v251, v20, v251 \n"
" v_cvt_f32_i32 v252, v252 \n"
" v_cvt_f32_i32 v253, v253 \n"
" v_cvt_f32_i32 v254, v254 \n"
" v_cvt_f32_i32 v255, v255 \n"
" v_mul_f32 v252, v25, v252 \n"
" v_mul_f32 v253, v25, v253 \n"
" v_mul_f32 v254, v25, v254 \n"
" v_mul_f32 v255, v25, v255 \n"
" v_mul_f32 v252, v13, v252 row_newbcast:12 \n"
" v_mul_f32 v253, v13, v253 row_newbcast:13 \n"
" v_mul_f32 v254, v13, v254 row_newbcast:14 \n"
" v_mul_f32 v255, v13, v255 row_newbcast:15 \n"
" v_mul_f32 v252, v21, v252 \n"
" v_mul_f32 v253, v21, v253 \n"
" v_mul_f32 v254, v21, v254 \n"
" v_mul_f32 v255, v21, v255 \n"
" v_cmp_u_f32 s[48:49], v224, v224 \n"
" v_add3_u32 v50, v224, v53, 1 \n"
" v_cndmask_b32 v54, v50, v52, s[48:49] \n"
" v_cmp_u_f32 s[48:49], v225, v225 \n"
" v_add3_u32 v50, v225, v53, 1 \n"
" v_cndmask_b32 v55, v50, v52, s[48:49] \n"
" v_perm_b32 v224, v55, v54, s52 \n"
" v_cmp_u_f32 s[48:49], v226, v226 \n"
" v_add3_u32 v50, v226, v53, 1 \n"
" v_cndmask_b32 v54, v50, v52, s[48:49] \n"
" v_cmp_u_f32 s[48:49], v227, v227 \n"
" v_add3_u32 v50, v227, v53, 1 \n"
" v_cndmask_b32 v55, v50, v52, s[48:49] \n"
" v_perm_b32 v225, v55, v54, s52 \n"
" v_cmp_u_f32 s[48:49], v228, v228 \n"
" v_add3_u32 v50, v228, v53, 1 \n"
" v_cndmask_b32 v54, v50, v52, s[48:49] \n"
" v_cmp_u_f32 s[48:49], v229, v229 \n"
" v_add3_u32 v50, v229, v53, 1 \n"
" v_cndmask_b32 v55, v50, v52, s[48:49] \n"
" v_perm_b32 v226, v55, v54, s52 \n"
" v_cmp_u_f32 s[48:49], v230, v230 \n"
" v_add3_u32 v50, v230, v53, 1 \n"
" v_cndmask_b32 v54, v50, v52, s[48:49] \n"
" v_cmp_u_f32 s[48:49], v231, v231 \n"
" v_add3_u32 v50, v231, v53, 1 \n"
" v_cndmask_b32 v55, v50, v52, s[48:49] \n"
" v_perm_b32 v227, v55, v54, s52 \n"
" v_cmp_u_f32 s[48:49], v232, v232 \n"
" v_add3_u32 v50, v232, v53, 1 \n"
" v_cndmask_b32 v54, v50, v52, s[48:49] \n"
" v_cmp_u_f32 s[48:49], v233, v233 \n"
" v_add3_u32 v50, v233, v53, 1 \n"
" v_cndmask_b32 v55, v50, v52, s[48:49] \n"
" v_perm_b32 v228, v55, v54, s52 \n"
" v_cmp_u_f32 s[48:49], v234, v234 \n"
" v_add3_u32 v50, v234, v53, 1 \n"
" v_cndmask_b32 v54, v50, v52, s[48:49] \n"
" v_cmp_u_f32 s[48:49], v235, v235 \n"
" v_add3_u32 v50, v235, v53, 1 \n"
" v_cndmask_b32 v55, v50, v52, s[48:49] \n"
" v_perm_b32 v229, v55, v54, s52 \n"
" v_cmp_u_f32 s[48:49], v236, v236 \n"
" v_add3_u32 v50, v236, v53, 1 \n"
" v_cndmask_b32 v54, v50, v52, s[48:49] \n"
" v_cmp_u_f32 s[48:49], v237, v237 \n"
" v_add3_u32 v50, v237, v53, 1 \n"
" v_cndmask_b32 v55, v50, v52, s[48:49] \n"
" v_perm_b32 v230, v55, v54, s52 \n"
" v_cmp_u_f32 s[48:49], v238, v238 \n"
" v_add3_u32 v50, v238, v53, 1 \n"
" v_cndmask_b32 v54, v50, v52, s[48:49] \n"
" v_cmp_u_f32 s[48:49], v239, v239 \n"
" v_add3_u32 v50, v239, v53, 1 \n"
" v_cndmask_b32 v55, v50, v52, s[48:49] \n"
" v_perm_b32 v231, v55, v54, s52 \n"
" v_cmp_u_f32 s[48:49], v240, v240 \n"
" v_add3_u32 v50, v240, v53, 1 \n"
" v_cndmask_b32 v54, v50, v52, s[48:49] \n"
" v_cmp_u_f32 s[48:49], v241, v241 \n"
" v_add3_u32 v50, v241, v53, 1 \n"
" v_cndmask_b32 v55, v50, v52, s[48:49] \n"
" v_perm_b32 v232, v55, v54, s52 \n"
" v_cmp_u_f32 s[48:49], v242, v242 \n"
" v_add3_u32 v50, v242, v53, 1 \n"
" v_cndmask_b32 v54, v50, v52, s[48:49] \n"
" v_cmp_u_f32 s[48:49], v243, v243 \n"
" v_add3_u32 v50, v243, v53, 1 \n"
" v_cndmask_b32 v55, v50, v52, s[48:49] \n"
" v_perm_b32 v233, v55, v54, s52 \n"
" v_cmp_u_f32 s[48:49], v244, v244 \n"
" v_add3_u32 v50, v244, v53, 1 \n"
" v_cndmask_b32 v54, v50, v52, s[48:49] \n"
" v_cmp_u_f32 s[48:49], v245, v245 \n"
" v_add3_u32 v50, v245, v53, 1 \n"
" v_cndmask_b32 v55, v50, v52, s[48:49] \n"
" v_perm_b32 v234, v55, v54, s52 \n"
" v_cmp_u_f32 s[48:49], v246, v246 \n"
" v_add3_u32 v50, v246, v53, 1 \n"
" v_cndmask_b32 v54, v50, v52, s[48:49] \n"
" v_cmp_u_f32 s[48:49], v247, v247 \n"
" v_add3_u32 v50, v247, v53, 1 \n"
" v_cndmask_b32 v55, v50, v52, s[48:49] \n"
" v_perm_b32 v235, v55, v54, s52 \n"
" v_cmp_u_f32 s[48:49], v248, v248 \n"
" v_add3_u32 v50, v248, v53, 1 \n"
" v_cndmask_b32 v54, v50, v52, s[48:49] \n"
" v_cmp_u_f32 s[48:49], v249, v249 \n"
" v_add3_u32 v50, v249, v53, 1 \n"
" v_cndmask_b32 v55, v50, v52, s[48:49] \n"
" v_perm_b32 v236, v55, v54, s52 \n"
" v_cmp_u_f32 s[48:49], v250, v250 \n"
" v_add3_u32 v50, v250, v53, 1 \n"
" v_cndmask_b32 v54, v50, v52, s[48:49] \n"
" v_cmp_u_f32 s[48:49], v251, v251 \n"
" v_add3_u32 v50, v251, v53, 1 \n"
" v_cndmask_b32 v55, v50, v52, s[48:49] \n"
" v_perm_b32 v237, v55, v54, s52 \n"
" v_cmp_u_f32 s[48:49], v252, v252 \n"
" v_add3_u32 v50, v252, v53, 1 \n"
" v_cndmask_b32 v54, v50, v52, s[48:49] \n"
" v_cmp_u_f32 s[48:49], v253, v253 \n"
" v_add3_u32 v50, v253, v53, 1 \n"
" v_cndmask_b32 v55, v50, v52, s[48:49] \n"
" v_perm_b32 v238, v55, v54, s52 \n"
" v_cmp_u_f32 s[48:49], v254, v254 \n"
" v_add3_u32 v50, v254, v53, 1 \n"
" v_cndmask_b32 v54, v50, v52, s[48:49] \n"
" v_cmp_u_f32 s[48:49], v255, v255 \n"
" v_add3_u32 v50, v255, v53, 1 \n"
" v_cndmask_b32 v55, v50, v52, s[48:49] \n"
" v_perm_b32 v239, v55, v54, s52 \n"
" ds_write_b64 v3, v[224:225] offset:35072 \n"
" ds_write_b64 v3, v[226:227] offset:43776 \n"
" ds_write_b64 v3, v[228:229] offset:37248 \n"
" ds_write_b64 v3, v[230:231] offset:45952 \n"
" ds_write_b64 v3, v[232:233] offset:39424 \n"
" ds_write_b64 v3, v[234:235] offset:48128 \n"
" ds_write_b64 v3, v[236:237] offset:41600 \n"
" ds_write_b64 v3, v[238:239] offset:50304 \n"
" s_waitcnt lgkmcnt(0) \n"
" s_barrier \n"
" ds_read_b32 v64, v4 offset:35072 \n"
" ds_read_b32 v65, v4 offset:39424 \n"
" ds_read_b32 v66, v4 offset:35104 \n"
" ds_read_b32 v67, v4 offset:39456 \n"
" ds_read_b32 v68, v4 offset:35136 \n"
" ds_read_b32 v69, v4 offset:39488 \n"
" ds_read_b32 v70, v4 offset:35168 \n"
" ds_read_b32 v71, v4 offset:39520 \n"
" ds_read_b32 v72, v4 offset:43776 \n"
" ds_read_b32 v73, v4 offset:48128 \n"
" ds_read_b32 v74, v4 offset:43808 \n"
" ds_read_b32 v75, v4 offset:48160 \n"
" ds_read_b32 v76, v4 offset:43840 \n"
" ds_read_b32 v77, v4 offset:48192 \n"
" ds_read_b32 v78, v4 offset:43872 \n"
" ds_read_b32 v79, v4 offset:48224 \n"
" s_waitcnt lgkmcnt(0) \n"
" s_mov_b64 exec, s[20:21] \n"
" global_atomic_pk_add_bf16 v80, v64, s[8:9] \n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[20:21] \n"
" global_atomic_pk_add_bf16 v80, v65, s[8:9] inst_offset:256\n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[22:23] \n"
" global_atomic_pk_add_bf16 v82, v66, s[8:9] \n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[22:23] \n"
" global_atomic_pk_add_bf16 v82, v67, s[8:9] inst_offset:256\n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[24:25] \n"
" global_atomic_pk_add_bf16 v84, v68, s[8:9] \n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[24:25] \n"
" global_atomic_pk_add_bf16 v84, v69, s[8:9] inst_offset:256\n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[26:27] \n"
" global_atomic_pk_add_bf16 v86, v70, s[8:9] \n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[26:27] \n"
" global_atomic_pk_add_bf16 v86, v71, s[8:9] inst_offset:256\n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[28:29] \n"
" global_atomic_pk_add_bf16 v88, v72, s[8:9] \n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[28:29] \n"
" global_atomic_pk_add_bf16 v88, v73, s[8:9] inst_offset:256\n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[30:31] \n"
" global_atomic_pk_add_bf16 v90, v74, s[8:9] \n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[30:31] \n"
" global_atomic_pk_add_bf16 v90, v75, s[8:9] inst_offset:256\n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[32:33] \n"
" global_atomic_pk_add_bf16 v92, v76, s[8:9] \n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[32:33] \n"
" global_atomic_pk_add_bf16 v92, v77, s[8:9] inst_offset:256\n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[34:35] \n"
" global_atomic_pk_add_bf16 v94, v78, s[8:9] \n"
" s_mov_b64 exec, s[36:37] \n"
" s_mov_b64 exec, s[34:35] \n"
" global_atomic_pk_add_bf16 v94, v79, s[8:9] inst_offset:256\n"
" s_mov_b64 exec, s[36:37] \n"
" s_add_u32 s8, s59, s8 \n"
" s_addc_u32 s9, 0, s9 \n"
" s_addk_i32 s80, 0x0100 \n"
" s_cmp_lt_i32 s80, s81 \n"
" s_cbranch_scc0 label_2301 \n"
" s_branch label_0C3C \n"
" label_2301: \n"
" s_waitcnt 0x0000 \n"
" s_endpgm \n"
#undef _UK_MFMA_
#undef _UK_PK_CVT_
#undef _UK_ATOMIC_ADD_
#ifndef CK_TILE_FLATMM_UK_MFMA
#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_INT8
#endif
#if CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_INT8
# define _UK_MFMA_ "v_mfma_i32_16x16x32_i8"
#endif
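// _DEQUAN_CVT_ below converts four i32 accumulators to f32 and applies two scales per element;
// in scalar form this is roughly out = float(acc) * s_row * s_col, where v15 appears to carry
// the per-token (row) scale and v17 the per-column weight scale broadcast across lanes via
// row_newbcast (register roles inferred, not authoritative).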
# define _DEQUAN_CVT_(a0,a1,a2,a3, b, c) \
" v_cvt_f32_i32 a0, a0 \n" \
" v_cvt_f32_i32 a1, a1 \n" \
" v_cvt_f32_i32 a2, a2 \n" \
" v_cvt_f32_i32 a3, a3 \n" \
" v_mul_f32 a0, v15, a0 \n" \
" v_mul_f32 a1, v15, a1 \n" \
" v_mul_f32 a2, v15, a2 \n" \
" v_mul_f32 a3, v15, a3 \n" \
" v_mul_f32 a0, v17, a0 row_newbcast:12 \n" \
" v_mul_f32 a1, v17, a1 row_newbcast:13 \n" \
" v_mul_f32 a2, v17, a2 row_newbcast:14 \n" \
" v_mul_f32 a3, v17, a3 row_newbcast:15 \n" \
";-------------------------------\n"
"s_mov_b32 s28, %[s_res_aq0] \n"
"s_mov_b32 s29, %[s_res_aq1] \n"
"s_mov_b32 s30, %[s_res_aq2] \n"
"s_mov_b32 s31, %[s_res_aq3] \n"
"s_mov_b32 s16, %[s_res_dq0] \n" "s_mov_b32 s16, %[s_res_dq0] \n"
"s_mov_b32 s17, %[s_res_dq1] \n" "s_mov_b32 s17, %[s_res_dq1] \n"
"s_mov_b32 s18, %[s_res_dq2] \n" "s_mov_b32 s18, %[s_res_dq2] \n"
...@@ -32,19 +43,7 @@ ...@@ -32,19 +43,7 @@
"s_mov_b32 s25, %[s_res_b1] \n" "s_mov_b32 s25, %[s_res_b1] \n"
"s_mov_b32 s26, %[s_res_b2] \n" "s_mov_b32 s26, %[s_res_b2] \n"
"s_mov_b32 s27, %[s_res_b3] \n" "s_mov_b32 s27, %[s_res_b3] \n"
";---------------------------------------------- \n"
" v_lshrrev_b32 v54, 4, v0 \n" " v_lshrrev_b32 v54, 4, v0 \n"
" v_lshlrev_b32 v55, 2, v54 \n" " v_lshlrev_b32 v55, 2, v54 \n"
" v_and_b32 v54, 15, v0 \n" " v_and_b32 v54, 15, v0 \n"
...@@ -55,21 +54,17 @@ ...@@ -55,21 +54,17 @@
" v_add_u32 v55, v54, v55 \n" " v_add_u32 v55, v54, v55 \n"
" v_lshlrev_b32 v10, 2, v55 \n" " v_lshlrev_b32 v10, 2, v55 \n"
" v_add_u32 v11, 0x00000400, v10 \n" " v_add_u32 v11, 0x00000400, v10 \n"
" s_mul_i32 s60, %[s_wave_id], 16 \n" " s_mul_i32 s60, %[s_wave_id], 16 \n"
" s_mul_i32 s60, s60, 4 \n" " s_mul_i32 s60, s60, 4 \n"
" v_add_u32 v10, s60, v10 \n" " v_add_u32 v10, s60, v10 \n"
" v_add_u32 v11, s60, v11 \n" " v_add_u32 v11, s60, v11 \n"
" v_mov_b32 v5, v10 \n" " v_mov_b32 v5, v10 \n"
";---------------------------------------------- \n"
" s_mov_b32 s57, 0x00000100 \n" " s_mov_b32 s57, 0x00000100 \n"
" s_mov_b32 s58, 0x00001000 \n" " s_mov_b32 s58, 0x00001000 \n"
" s_mov_b32 s79, 0x00000400 \n" " s_mov_b32 s79, 0x00000400 \n"
" s_mov_b32 s59, 0x00000200 \n" " s_mov_b32 s59, 0x00000200 \n"
//////// ";---------------------------------------------- \n"
//" s_mul_i32 s60, s70, 0x00000100 \n"
//" s_sub_u32 s56, s60, 0x00001000 \n"
///////////////
" s_mov_b32 s78, 0x00001000 \n" " s_mov_b32 s78, 0x00001000 \n"
" s_mov_b32 s52, 0x07060302 \n" " s_mov_b32 s52, 0x07060302 \n"
" s_mov_b32 s53, 0x00000400 \n" " s_mov_b32 s53, 0x00000400 \n"
...@@ -82,7 +77,7 @@ ...@@ -82,7 +77,7 @@
" v_mov_b32 v52, 0x7fff0000 \n" " v_mov_b32 v52, 0x7fff0000 \n"
" v_mov_b32 v53, 0x00007fff \n" " v_mov_b32 v53, 0x00007fff \n"
" s_waitcnt 0x0000 \n" " s_waitcnt 0x0000 \n"
";---------------------------------------------- \n"
" v_mov_b32 %[v_token_id], %[v_token_id] \n" " v_mov_b32 %[v_token_id], %[v_token_id] \n"
" v_lshrrev_b32 v54, 24, %[v_token_id] \n" " v_lshrrev_b32 v54, 24, %[v_token_id] \n"
" v_mul_i32_i24 v54, s66, v54 \n" " v_mul_i32_i24 v54, s66, v54 \n"
...@@ -104,8 +99,7 @@ ...@@ -104,8 +99,7 @@
" buffer_load_dword v21, v9, s[40:43], 0 offen \n" " buffer_load_dword v21, v9, s[40:43], 0 offen \n"
" s_mov_b32 s80, 0 \n" " s_mov_b32 s80, 0 \n"
";---------------------------------------------- \n"
// "s_nop 4\n"
"; -- prefetch A0\n" "; -- prefetch A0\n"
"s_add_u32 m0, 0, %[s_m0_init] \n" "s_add_u32 m0, 0, %[s_m0_init] \n"
"buffer_load_dword %[v_os_a0], s[20:23], 0 offen lds \n" "buffer_load_dword %[v_os_a0], s[20:23], 0 offen lds \n"
...@@ -183,18 +177,17 @@ ...@@ -183,18 +177,17 @@
" s_waitcnt vmcnt(40) \n" " s_waitcnt vmcnt(40) \n"
" s_barrier \n" " s_barrier \n"
";---------------------------------------------- \n"
"ds_read_b128 v[192:195], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_0]\n" // 1024: N stride, 64 K stride "ds_read_b128 v[192:195], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_0]\n" // 1024: N stride, 64 K stride
"ds_read_b128 v[196:199], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_1]\n" "ds_read_b128 v[196:199], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_1]\n"
"ds_read_b128 v[200:203], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_2]\n" "ds_read_b128 v[200:203], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_2]\n"
"ds_read_b128 v[204:207], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_3]\n" "ds_read_b128 v[204:207], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_3]\n"
"ds_read_b128 v[208:211], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_4]\n" "ds_read_b128 v[208:211], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_4]\n"
"ds_read_b128 v[212:215], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_5]\n" "ds_read_b128 v[212:215], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_5]\n"
"ds_read_b128 v[216:219], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_6]\n" "ds_read_b128 v[216:219], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_6]\n"
"ds_read_b128 v[220:223], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_7]\n" "ds_read_b128 v[220:223], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_7]\n"
";---------------------------------------------- \n"
" label_start: \n"
" s_waitcnt vmcnt(24) & lgkmcnt(0) \n" " s_waitcnt vmcnt(24) & lgkmcnt(0) \n"
" s_barrier \n" " s_barrier \n"
_UK_MFMA_ " v[128:131], acc[0:1], v[192:193], v[128:131] \n" _UK_MFMA_ " v[128:131], acc[0:1], v[192:193], v[128:131] \n"
...@@ -400,7 +393,7 @@ ...@@ -400,7 +393,7 @@
" s_waitcnt vmcnt(24) & lgkmcnt(0) \n" " s_waitcnt vmcnt(24) & lgkmcnt(0) \n"
" s_barrier \n" " s_barrier \n"
_UK_MFMA_ " v[128:131], acc[128:129], v[224:225], v[128:131] \n" _UK_MFMA_ " v[128:131], acc[128:129], v[224:225], v[128:131] \n"
_UK_MFMA_ " v[128:131], acc[130:131], v[226:227], v[128:131] \n" _UK_MFMA_ " v[128:131], acc[130:131], v[226:227], v[128:131] \n"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[24:27], 0 offen \n" " buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[24:27], 0 offen \n"
_UK_MFMA_ " v[128:131], acc[132:133], v[228:229], v[128:131] \n" _UK_MFMA_ " v[128:131], acc[132:133], v[228:229], v[128:131] \n"
_UK_MFMA_ " v[128:131], acc[134:135], v[230:231], v[128:131] \n" _UK_MFMA_ " v[128:131], acc[134:135], v[230:231], v[128:131] \n"
...@@ -461,49 +454,49 @@ ...@@ -461,49 +454,49 @@
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[24:27], 0 offen \n" " buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[24:27], 0 offen \n"
_UK_MFMA_ " v[144:147], acc[164:165], v[228:229], v[144:147] \n" _UK_MFMA_ " v[144:147], acc[164:165], v[228:229], v[144:147] \n"
_UK_MFMA_ " v[144:147], acc[166:167], v[230:231], v[144:147] \n" _UK_MFMA_ " v[144:147], acc[166:167], v[230:231], v[144:147] \n"
" ds_read_b128 v[192:195], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_0] " ds_read_b128 v[192:195], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_0] \n"
_UK_MFMA_ " v[144:147], acc[168:169], v[232:233], v[144:147] \n" _UK_MFMA_ " v[144:147], acc[168:169], v[232:233], v[144:147] \n"
_UK_MFMA_ " v[144:147], acc[170:171], v[234:235], v[144:147] \n" _UK_MFMA_ " v[144:147], acc[170:171], v[234:235], v[144:147] \n"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[24:27], 0 offen offset:1024 \n" " buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " v[144:147], acc[172:173], v[236:237], v[144:147] \n" _UK_MFMA_ " v[144:147], acc[172:173], v[236:237], v[144:147] \n"
_UK_MFMA_ " v[144:147], acc[174:175], v[238:239], v[144:147] \n" _UK_MFMA_ " v[144:147], acc[174:175], v[238:239], v[144:147] \n"
" ds_read_b128 v[196:199], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_1] " ds_read_b128 v[196:199], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_1] \n"
_UK_MFMA_ " v[148:151], acc[160:161], v[240:241], v[148:151] \n" _UK_MFMA_ " v[148:151], acc[160:161], v[240:241], v[148:151] \n"
_UK_MFMA_ " v[148:151], acc[162:163], v[242:243], v[148:151] \n" _UK_MFMA_ " v[148:151], acc[162:163], v[242:243], v[148:151] \n"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[24:27], 0 offen offset:2048 \n" " buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " v[148:151], acc[164:165], v[244:245], v[148:151] \n" _UK_MFMA_ " v[148:151], acc[164:165], v[244:245], v[148:151] \n"
_UK_MFMA_ " v[148:151], acc[166:167], v[246:247], v[148:151] \n" _UK_MFMA_ " v[148:151], acc[166:167], v[246:247], v[148:151] \n"
" ds_read_b128 v[200:203], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_2] " ds_read_b128 v[200:203], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_2] \n"
_UK_MFMA_ " v[148:151], acc[168:169], v[248:249], v[148:151] \n" _UK_MFMA_ " v[148:151], acc[168:169], v[248:249], v[148:151] \n"
_UK_MFMA_ " v[148:151], acc[170:171], v[250:251], v[148:151] \n" _UK_MFMA_ " v[148:151], acc[170:171], v[250:251], v[148:151] \n"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[24:27], 0 offen offset:3072 \n" " buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " v[148:151], acc[172:173], v[252:253], v[148:151] \n" _UK_MFMA_ " v[148:151], acc[172:173], v[252:253], v[148:151] \n"
_UK_MFMA_ " v[148:151], acc[174:175], v[254:255], v[148:151] \n" _UK_MFMA_ " v[148:151], acc[174:175], v[254:255], v[148:151] \n"
" ds_read_b128 v[204:207], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_3] " ds_read_b128 v[204:207], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_3] \n"
_UK_MFMA_ " v[152:155], acc[176:177], v[224:225], v[152:155] \n" _UK_MFMA_ " v[152:155], acc[176:177], v[224:225], v[152:155] \n"
_UK_MFMA_ " v[152:155], acc[178:179], v[226:227], v[152:155] \n" _UK_MFMA_ " v[152:155], acc[178:179], v[226:227], v[152:155] \n"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[24:27], 0 offen \n" " buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[24:27], 0 offen \n"
_UK_MFMA_ " v[152:155], acc[180:181], v[228:229], v[152:155] \n" _UK_MFMA_ " v[152:155], acc[180:181], v[228:229], v[152:155] \n"
_UK_MFMA_ " v[152:155], acc[182:183], v[230:231], v[152:155] \n" _UK_MFMA_ " v[152:155], acc[182:183], v[230:231], v[152:155] \n"
" ds_read_b128 v[208:211], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_4] " ds_read_b128 v[208:211], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_4] \n"
_UK_MFMA_ " v[152:155], acc[184:185], v[232:233], v[152:155] \n" _UK_MFMA_ " v[152:155], acc[184:185], v[232:233], v[152:155] \n"
_UK_MFMA_ " v[152:155], acc[186:187], v[234:235], v[152:155] \n" _UK_MFMA_ " v[152:155], acc[186:187], v[234:235], v[152:155] \n"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[24:27], 0 offen offset:1024 \n" " buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " v[152:155], acc[188:189], v[236:237], v[152:155] \n" _UK_MFMA_ " v[152:155], acc[188:189], v[236:237], v[152:155] \n"
_UK_MFMA_ " v[152:155], acc[190:191], v[238:239], v[152:155] \n" _UK_MFMA_ " v[152:155], acc[190:191], v[238:239], v[152:155] \n"
" ds_read_b128 v[212:215], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_5] " ds_read_b128 v[212:215], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_5] \n"
_UK_MFMA_ " v[156:159], acc[176:177], v[240:241], v[156:159] \n" _UK_MFMA_ " v[156:159], acc[176:177], v[240:241], v[156:159] \n"
_UK_MFMA_ " v[156:159], acc[178:179], v[242:243], v[156:159] \n" _UK_MFMA_ " v[156:159], acc[178:179], v[242:243], v[156:159] \n"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[24:27], 0 offen offset:2048 \n" " buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " v[156:159], acc[180:181], v[244:245], v[156:159] \n" _UK_MFMA_ " v[156:159], acc[180:181], v[244:245], v[156:159] \n"
_UK_MFMA_ " v[156:159], acc[182:183], v[246:247], v[156:159] \n" _UK_MFMA_ " v[156:159], acc[182:183], v[246:247], v[156:159] \n"
" ds_read_b128 v[216:219], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_6] " ds_read_b128 v[216:219], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_6] \n"
_UK_MFMA_ " v[156:159], acc[184:185], v[248:249], v[156:159] \n" _UK_MFMA_ " v[156:159], acc[184:185], v[248:249], v[156:159] \n"
_UK_MFMA_ " v[156:159], acc[186:187], v[250:251], v[156:159] \n" _UK_MFMA_ " v[156:159], acc[186:187], v[250:251], v[156:159] \n"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[24:27], 0 offen offset:3072 \n" " buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " v[156:159], acc[188:189], v[252:253], v[156:159] \n" _UK_MFMA_ " v[156:159], acc[188:189], v[252:253], v[156:159] \n"
_UK_MFMA_ " v[156:159], acc[190:191], v[254:255], v[156:159] \n" _UK_MFMA_ " v[156:159], acc[190:191], v[254:255], v[156:159] \n"
" ds_read_b128 v[220:223], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_7] " ds_read_b128 v[220:223], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_7] \n"
" s_waitcnt vmcnt(32) \n" " s_waitcnt vmcnt(32) \n"
_UK_MFMA_ " v[160:163], acc[192:193], v[224:225], v[160:163] \n" _UK_MFMA_ " v[160:163], acc[192:193], v[224:225], v[160:163] \n"
_UK_MFMA_ " v[160:163], acc[194:195], v[226:227], v[160:163] \n" _UK_MFMA_ " v[160:163], acc[194:195], v[226:227], v[160:163] \n"
...@@ -601,7 +594,7 @@ ...@@ -601,7 +594,7 @@
" s_cbranch_scc0 label_end \n" " s_cbranch_scc0 label_end \n"
" s_branch label_start%= \n" " s_branch label_start%= \n"
" label_end : \n" " label_end : \n"
//dequant ";---------------------------------------------- \n"
" v_cvt_f32_i32 v128, v128 \n" " v_cvt_f32_i32 v128, v128 \n"
" v_cvt_f32_i32 v129, v129 \n" " v_cvt_f32_i32 v129, v129 \n"
" v_cvt_f32_i32 v130, v130 \n" " v_cvt_f32_i32 v130, v130 \n"
...@@ -794,7 +787,7 @@ ...@@ -794,7 +787,7 @@
" v_mul_f32 v189, v17, v189 row_newbcast:13 \n" " v_mul_f32 v189, v17, v189 row_newbcast:13 \n"
" v_mul_f32 v190, v17, v190 row_newbcast:14 \n" " v_mul_f32 v190, v17, v190 row_newbcast:14 \n"
" v_mul_f32 v191, v17, v191 row_newbcast:15 \n" " v_mul_f32 v191, v17, v191 row_newbcast:15 \n"
#undef _UK_MFMA_ #undef _UK_MFMA_
#undef _DEQUAN_CVT_
...@@ -10,6 +10,7 @@
#include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_ex.hpp"
#include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp"
#include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp"
#include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk_int8.hpp"
#include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_problem.hpp" #include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_problem.hpp"
#include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_traits.hpp" #include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_traits.hpp"
#include "ck_tile/ops/fused_moe/pipeline/moe_sorting_pipeline.hpp" #include "ck_tile/ops/fused_moe/pipeline/moe_sorting_pipeline.hpp"
......
...@@ -198,7 +198,7 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8
        // actual per-row source addresses
        auto a_coords = generate_tuple(
            [&](auto i) {
                return (token_id[i]) * kargs.stride_token +
                       threadIdx.x % (BlockShape::Block_K0 / kAlignmentA) * kAlignmentA;
            },
            number<row_ids_a.size()>{});
...@@ -254,7 +254,7 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8
                make_tuple(shared_intermediate_size_1),
                number<1>{});
            return gq_view_;
        }();
        auto gq_res = gq_win.get_buffer_view().cached_buf_res_;
...@@ -345,7 +345,7 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8
        auto o_coords = generate_tuple(
            [&](auto i) {
                return token_id[i] * kargs.stride_token +
                       threadIdx.x % (BlockShape::Block_N1 / kAlignmentO) * kAlignmentO;
            },
            number<row_ids_a.size()>{});
...@@ -376,6 +376,7 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8
            row_ids_a, // fake token id, 2D index for X scale
            aq_res,
            gq_res,
            gq_res,
            dq_res,
            a_res,
            a_coords,
......
...@@ -143,7 +143,7 @@ using WarpGemmMfmaFp8Fp8F32M32N32K16SwizzleBTransposedCDistribution =
// int8
using WarpGemmMfma_i32_16x16x64_int8_int8_CTransposed =
    WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution<
        WarpGemmAttributeMfmaImpl_i32_16x16x32_i8<WGAttrCtlEnum::Default_>,
        2>>;
} // namespace ck_tile
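// Reference only: a scalar sketch of the contraction one v_mfma_i32_16x16x32_i8 performs per
// 16x16 output tile -- an int8 x int8 dot product of length 32 accumulated into int32. The
// lane/register layout is omitted and the B operand is taken pre-transposed (N x K); this is
// an illustrative model under those assumptions, not the production code path.
static inline void mfma_i32_16x16x32_i8_ref(const signed char* a, // A tile, 16 x 32, row-major (M x K)
                                            const signed char* b, // B tile, 16 x 32, row-major (N x K)
                                            int* c)               // C tile, 16 x 16, row-major, accumulated in place
{
    for(int m = 0; m < 16; ++m)
        for(int n = 0; n < 16; ++n)
        {
            int acc = c[m * 16 + n];
            for(int k = 0; k < 32; ++k)
                acc += static_cast<int>(a[m * 32 + k]) * static_cast<int>(b[n * 32 + k]);
            c[m * 16 + n] = acc;
        }
}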
...@@ -655,7 +655,7 @@ struct WarpGemmAttributeMfmaImpl_i32_16x16x32_i8
        else
        {
#if defined(__gfx94__)
            c_vec = __builtin_amdgcn_mfma_i32_16x16x32_i8(
                bit_cast<long>(a_vec), bit_cast<long>(b_vec), c_vec, 0, 0, 0);
#elif defined(__gfx908__) || defined(__gfx90a__)
            static_for<0, 8, 1>{}([&](auto k) {
......