Commit a75f162b authored by coderfeli's avatar coderfeli
Browse files

add files

parent 59c05300
...@@ -33,20 +33,20 @@ struct fmoe_ // traits, ugly name, only used for internal ...@@ -33,20 +33,20 @@ struct fmoe_ // traits, ugly name, only used for internal
using YSmoothScaleDataType = ck_tile::remove_cvref_t<typename TypeConfig::YSmoothScaleDataType>; using YSmoothScaleDataType = ck_tile::remove_cvref_t<typename TypeConfig::YSmoothScaleDataType>;
using TopkWeightDataType = ck_tile::remove_cvref_t<typename TypeConfig::TopkWeightDataType>; using TopkWeightDataType = ck_tile::remove_cvref_t<typename TypeConfig::TopkWeightDataType>;
using IndexDataType = ck_tile::remove_cvref_t<typename TypeConfig::IndexDataType>; using IndexDataType = ck_tile::remove_cvref_t<typename TypeConfig::IndexDataType>;
// S<32, 1024, 128, 128>, S<1, 4, 1>, S<16, 16, 32>
static constexpr ck_tile::index_t BT_ = BlockTIle_::at(ck_tile::number<0>{}); // block token static constexpr ck_tile::index_t BT_ = BlockTIle_::at(ck_tile::number<0>{}); // block token
static constexpr ck_tile::index_t BI_ = static constexpr ck_tile::index_t BI_ =
BlockTIle_::at(ck_tile::number<1>{}); // block intermediate BlockTIle_::at(ck_tile::number<1>{}); // block intermediate
static constexpr ck_tile::index_t BH_ = BlockTIle_::at(ck_tile::number<2>{}); // block hidden static constexpr ck_tile::index_t BH_ = BlockTIle_::at(ck_tile::number<2>{}); // block hidden
static constexpr ck_tile::index_t BD_ = BlockTIle_::at(ck_tile::number<3>{}); // block down static constexpr ck_tile::index_t BD_ = BlockTIle_::at(ck_tile::number<3>{}); // block down
using BlockTile_0 = ck_tile::sequence<BT_, BI_ / (GateOnly_ ? 1 : 2), BH_>; using BlockTile_0 = ck_tile::sequence<BT_, BI_ / (GateOnly_ ? 1 : 2), BH_>; //32, 512, 128
using WarpPerBlock_0 = ck_tile::remove_cvref_t<WarpPerBlock_>; using WarpPerBlock_0 = ck_tile::remove_cvref_t<WarpPerBlock_>; // S<1, 4, 1>
using WarpTile_0 = ck_tile::remove_cvref_t<WarpTile_>; using WarpTile_0 = ck_tile::remove_cvref_t<WarpTile_>; // S<16, 16, 32>
using BlockTile_1 = ck_tile::sequence<BT_, BD_, BI_ / (GateOnly_ ? 1 : 2)>; using BlockTile_1 = ck_tile::sequence<BT_, BD_, BI_ / (GateOnly_ ? 1 : 2)>; // 32, 128, 512
using WarpPerBlock_1 = ck_tile::remove_cvref_t<WarpPerBlock_>; using WarpPerBlock_1 = ck_tile::remove_cvref_t<WarpPerBlock_>; /// S<1, 4, 1>
using WarpTile_1 = ck_tile::remove_cvref_t<WarpTile_>; using WarpTile_1 = ck_tile::remove_cvref_t<WarpTile_>; // S<16, 16, 32>
static constexpr ck_tile::index_t GateOnly = GateOnly_; static constexpr ck_tile::index_t GateOnly = GateOnly_;
static constexpr ck_tile::index_t FusedQuant = FusedQuant_; static constexpr ck_tile::index_t FusedQuant = FusedQuant_;
......
...@@ -285,7 +285,10 @@ bool run(const ck_tile::ArgParser& arg_parser) ...@@ -285,7 +285,10 @@ bool run(const ck_tile::ArgParser& arg_parser)
} }
else else
{ {
topid_unique_gen<IndexDataType>(topk_ids_host.mData, tokens, topk, experts, 11913); for(int i = 0; i < static_cast<int>(topk_ids_host.mData.size()); i++) {
topk_ids_host.mData[i] = i % 4;
}
// topid_unique_gen<IndexDataType>(topk_ids_host.mData, tokens, topk, experts, 11913);
} }
// leave it here for future debug purpose // leave it here for future debug purpose
...@@ -442,6 +445,10 @@ bool run(const ck_tile::ArgParser& arg_parser) ...@@ -442,6 +445,10 @@ bool run(const ck_tile::ArgParser& arg_parser)
topk, topk,
gate_only); gate_only);
sorted_token_ids_host.savetxt("sorted_token_ids_host.txt", "int");
sorted_expert_ids_host.savetxt("sorted_expert_ids_host.txt", "int");
num_sorted_tiles_host.savetxt("num_sorted_tiles_host.txt", "int");
auto o_dev = o_buf.ToHost<ODataType>(); auto o_dev = o_buf.ToHost<ODataType>();
// o_dev.savetxt("gpu-out.txt", "float"); // o_dev.savetxt("gpu-out.txt", "float");
auto [rtol, atol] = get_elimit<ADataType>(); auto [rtol, atol] = get_elimit<ADataType>();
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/gemm/warp/warp_gemm.hpp"
#include "ck_tile/ops/flatmm/block/flatmm_uk_config.hpp"
#include "ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32.hpp"
namespace ck_tile {
// "S"tream update output along "N"
// A in smem, B load from global
// require 4 wave, occupancy=1c
struct FlatmmSn_32x128x512_1x4x1_16x16x32_BF16_itl : public FlatmmSn_32x128x512_1x4x1_16x16x32_Base
{
using BDataType = bf16_t;
using ODataType = bf16_t;
// TODO: need paired with tile_window_linear!
// TODO: need call init_raw() before call this function!
// template <typename AWindow, typename BWindow, typename OWindow, typename ScaleTensor>
template <typename BRes,
typename BCoords,
typename ORes,
typename OCoords,
typename OFlags,
typename ScaleTensor>
CK_TILE_DEVICE auto
operator()(const BRes& res_b,
const BCoords& cached_coords_b,
const ORes& res_o,
const OCoords& cached_coords_o,
const OFlags& o_flags, // this should be in sgpr
CK_TILE_LDS_ADDR void* smem,
index_t n, // loop along n dim
const ScaleTensor& scale_,
index_t tile_offset_b, // stride b is fixed to blockKr * blockW, but still can adjust
index_t tile_offset_o)
{
// static_assert(BCoords::size() == 8); // 8
static_assert(BCoords::size() == 4);
static_assert(OCoords::size() == 8);
const index_t tile_stride_b_bytes = tile_offset_b * sizeof(BDataType);
const index_t tile_stride_o_bytes = tile_offset_o * sizeof(ODataType);
static_assert(ScaleTensor::size() == 2);
float s0 = scale_[number<0>{}];
float s1 = scale_[number<1>{}];
// index_t loop_cnt = n / Block_N;
register float v_c0 asm("v64");
register float v_c1 asm("v65");
register float v_c2 asm("v66");
register float v_c3 asm("v67");
register float v_c4 asm("v68");
register float v_c5 asm("v69");
register float v_c6 asm("v70");
register float v_c7 asm("v71");
register float v_c8 asm("v72");
register float v_c9 asm("v73");
register float v_c10 asm("v74");
register float v_c11 asm("v75");
register float v_c12 asm("v76");
register float v_c13 asm("v77");
register float v_c14 asm("v78");
register float v_c15 asm("v79");
register float v_c16 asm("v80");
register float v_c17 asm("v81");
register float v_c18 asm("v82");
register float v_c19 asm("v83");
register float v_c20 asm("v84");
register float v_c21 asm("v85");
register float v_c22 asm("v86");
register float v_c23 asm("v87");
register float v_c24 asm("v88");
register float v_c25 asm("v89");
register float v_c26 asm("v90");
register float v_c27 asm("v91");
register float v_c28 asm("v92");
register float v_c29 asm("v93");
register float v_c30 asm("v94");
register float v_c31 asm("v95");
int32_t nan_hi = 0x7fff0000;
int32_t nan_lo = 0x00007fff;
// in smem, the layout is M0(2)*K0(128)*M1(16)*K1(4)
// every threads need 8xK in contiguous register
// ... and every wave need the same data
int lane_id = threadIdx.x % 64;
int sld_y_os = (lane_id % 16) * 4 + (lane_id / 16) * 128;
sld_y_os *= 2;
// y y p p p y
// reg before shfl M0(2)*N0(2)*Nl(4)*Nw(4)*Mw(16)*Nv(4)
// but order is N0*M0*Nv
// in LDS we need store as
// M0(2)* N0(2) * Nl(4) * Nw(4) * (Mw(16)*Nv(4) + 4)
// y y wave-id lid/16 lid%16 v
// sst(v3) = (v0/16*34 + v0%16 * 2 + wid*136) * 4
int sfl_sst = (threadIdx.x % 16 * 4) + (threadIdx.x / 16) * (64 + 4);
sfl_sst *= 2;
// from LDS we need load as
// M0(2)* N0(2) * Nl(4) * Nw(4) * (Mw(16) * Nv(4) + 4)
// ( 2 issue) (rem 32-lane) (4 wave*4issue) 2lane*1ussue(pk2)
// sld(v4) = v0/2 *34*4 + v0 % 2 *4 + wid*2 *4
int sfl_sld = (lane_id % 2) * 2 + (lane_id / 2) * (64 + 4) + (threadIdx.x / 64) * 4;
sfl_sld *= 2;
// B nr->kr
// clang-format off
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Winline-asm"
asm volatile(
#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_BF16
#include "uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16_itl.inc"
#undef CK_TILE_FLATMM_UK_MFMA
:[smem_]"+r"(smem),
// [s_loop_cnt]"+s"(loop_cnt),
[s_loop_cnt]"+s"(n),
[c0]"+v" (v_c0),
[c1]"+v" (v_c1),
[c2]"+v" (v_c2),
[c3]"+v" (v_c3),
[c4]"+v" (v_c4),
[c5]"+v" (v_c5),
[c6]"+v" (v_c6),
[c7]"+v" (v_c7),
[c8]"+v" (v_c8),
[c9]"+v" (v_c9),
[c10]"+v"(v_c10),
[c11]"+v"(v_c11),
[c12]"+v"(v_c12),
[c13]"+v"(v_c13),
[c14]"+v"(v_c14),
[c15]"+v"(v_c15),
[c16]"+v"(v_c16),
[c17]"+v"(v_c17),
[c18]"+v"(v_c18),
[c19]"+v"(v_c19),
[c20]"+v"(v_c20),
[c21]"+v"(v_c21),
[c22]"+v"(v_c22),
[c23]"+v"(v_c23),
[c24]"+v"(v_c24),
[c25]"+v"(v_c25),
[c26]"+v"(v_c26),
[c27]"+v"(v_c27),
[c28]"+v"(v_c28),
[c29]"+v"(v_c29),
[c30]"+v"(v_c30),
[c31]"+v"(v_c31)
:
[sld_a_base]"n"(0),
[shfl_base]"n"(0),
[v_sld_y_os]"v"(sld_y_os),
[v_sfl_sld]"v"(sfl_sld),
[v_sfl_sst]"v"(sfl_sst),
[s_res_o0]"s"(res_o[0]),
[s_res_o1]"s"(res_o[1]),
//[s_res_o2]"s"(res_o[2]),
//[s_res_o3]"s"(res_o[3]),
[s_res_b0]"s"(res_b[0]),
[s_res_b1]"s"(res_b[1]),
[s_res_b2]"s"(res_b[2]),
[s_res_b3]"s"(res_b[3]),
[v_os_o0]"v"(static_cast<index_t>(cached_coords_o[number<0>{}] * sizeof(ODataType))),
[v_os_o1]"v"(static_cast<index_t>(cached_coords_o[number<1>{}] * sizeof(ODataType))),
[v_os_o2]"v"(static_cast<index_t>(cached_coords_o[number<2>{}] * sizeof(ODataType))),
[v_os_o3]"v"(static_cast<index_t>(cached_coords_o[number<3>{}] * sizeof(ODataType))),
[v_os_o4]"v"(static_cast<index_t>(cached_coords_o[number<4>{}] * sizeof(ODataType))),
[v_os_o5]"v"(static_cast<index_t>(cached_coords_o[number<5>{}] * sizeof(ODataType))),
[v_os_o6]"v"(static_cast<index_t>(cached_coords_o[number<6>{}] * sizeof(ODataType))),
[v_os_o7]"v"(static_cast<index_t>(cached_coords_o[number<7>{}] * sizeof(ODataType))),
[v_os_b0]"v"(static_cast<index_t>(cached_coords_b[number<0>{}] * sizeof(BDataType))),
[v_os_b1]"v"(static_cast<index_t>(cached_coords_b[number<1>{}] * sizeof(BDataType))),
[v_os_b2]"v"(static_cast<index_t>(cached_coords_b[number<2>{}] * sizeof(BDataType))),
[v_os_b3]"v"(static_cast<index_t>(cached_coords_b[number<3>{}] * sizeof(BDataType))),
[v_os_b4]"v"(static_cast<index_t>(cached_coords_b[number<4>{}] * sizeof(BDataType))),
[v_os_b5]"v"(static_cast<index_t>(cached_coords_b[number<5>{}] * sizeof(BDataType))),
[v_os_b6]"v"(static_cast<index_t>(cached_coords_b[number<6>{}] * sizeof(BDataType))),
[v_os_b7]"v"(static_cast<index_t>(cached_coords_b[number<7>{}] * sizeof(BDataType))),
[s_tile_os_o]"s"(tile_stride_o_bytes),
[s_tile_os_b]"s"(tile_stride_b_bytes),
[scale_0]"v"(s0),
[scale_1]"v"(s1),
[v_nan_lo]"v"(nan_lo),
[v_nan_hi]"v"(nan_hi),
[s_execflag_0]"s"(o_flags[number<0>{}]),
[s_execflag_1]"s"(o_flags[number<1>{}]),
[s_execflag_2]"s"(o_flags[number<2>{}]),
[s_execflag_3]"s"(o_flags[number<3>{}]),
[s_execflag_4]"s"(o_flags[number<4>{}]),
[s_execflag_5]"s"(o_flags[number<5>{}]),
[s_execflag_6]"s"(o_flags[number<6>{}]),
[s_execflag_7]"s"(o_flags[number<7>{}])
:
"memory", "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8", "a9",
"a10", "a11", "a12", "a13", "a14", "a15", "a16", "a17", "a18", "a19",
"a20", "a21", "a22", "a23", "a24", "a25", "a26", "a27", "a28", "a29",
"a30", "a31", "a32", "a33", "a34", "a35", "a36", "a37", "a38", "a39",
"a40", "a41", "a42", "a43", "a44", "a45", "a46", "a47", "a48", "a49",
"a50", "a51", "a52", "a53", "a54", "a55", "a56", "a57", "a58", "a59",
"a60", "a61", "a62", "a63", "a64", "a65", "a66", "a67", "a68", "a69",
"a70", "a71", "a72", "a73", "a74", "a75", "a76", "a77", "a78", "a79",
"a80", "a81", "a82", "a83", "a84", "a85", "a86", "a87", "a88", "a89",
"a90", "a91", "a92", "a93", "a94", "a95", "a96", "a97", "a98", "a99",
"a100", "a101", "a102", "a103", "a104", "a105", "a106", "a107",
"a108", "a109", "a110", "a111", "a112", "a113", "a114", "a115",
"a116", "a117", "a118", "a119", "a120", "a121", "a122", "a123",
"a124", "a125", "a126", "a127", "a128", "a129", "a130", "a131",
"a132", "a133", "a134", "a135", "a136", "a137", "a138", "a139",
"a140", "a141", "a142", "a143", "a144", "a145", "a146", "a147",
"a148", "a149", "a150", "a151", "a152", "a153", "a154", "a155",
"a156", "a157", "a158", "a159", "a160", "a161", "a162", "a163",
"a164", "a165", "a166", "a167", "a168", "a169", "a170", "a171",
"a172", "a173", "a174", "a175", "a176", "a177", "a178", "a179",
"a180", "a181", "a182", "a183", "a184", "a185", "a186", "a187",
"a188", "a189", "a190", "a191", "a192", "a193", "a194", "a195",
"a196", "a197", "a198", "a199", "a200", "a201", "a202", "a203",
"a204", "a205", "a206", "a207", "a208", "a209", "a210", "a211",
"a212", "a213", "a214", "a215", "a216", "a217", "a218", "a219",
"a220", "a221", "a222", "a223", "a224", "a225", "a226", "a227",
"a228", "a229", "a230", "a231", "a232", "a233", "a234", "a235",
"a236", "a237", "a238", "a239", "a240", "a241", "a242", "a243",
"a244", "a245", "a246", "a247", "a248", "a249", "a250", "a251",
"a252", "a253", "a254", "a255",
"s8", "s9", "s12", "s13", "s14", "s15", "s38", "s39", "s52", "s86",
"s36", "s37","s59","s80",
"v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
"v50", "v54", "v55",
"v64","v65","v66","v67","v68","v69","v70","v71",
"v72","v73","v74","v75","v76","v77","v78","v79",
"v80","v81","v82","v83","v84","v85","v86","v87",
"v88","v89","v90","v91","v92","v93","v94","v95",
"v128", "v129", "v130", "v131",
"v132", "v133", "v134", "v135", "v136", "v137", "v138", "v139",
"v140", "v141", "v142", "v143", "v144", "v145", "v146", "v147",
"v148", "v149", "v150", "v151", "v152", "v153", "v154", "v155",
"v156", "v157", "v158", "v159", "v160", "v161", "v162", "v163",
"v164", "v165", "v166", "v167", "v168", "v169", "v170", "v171",
"v172", "v173", "v174", "v175", "v176", "v177", "v178", "v179",
"v180", "v181", "v182", "v183", "v184", "v185", "v186", "v187",
"v188", "v189", "v190", "v191", "v192", "v193", "v194", "v195",
"v196", "v197", "v198", "v199", "v200", "v201", "v202", "v203",
"v204", "v205", "v206", "v207", "v208", "v209", "v210", "v211",
"v212", "v213", "v214", "v215", "v216", "v217", "v218", "v219",
"v220", "v221", "v222", "v223", "v224", "v225", "v226", "v227",
"v228", "v229", "v230", "v231", "v232", "v233", "v234", "v235",
"v236", "v237", "v238", "v239", "v240", "v241", "v242", "v243",
"v244", "v245", "v246", "v247", "v248", "v249", "v250", "v251",
"v252", "v253", "v254", "v255"
);
#pragma clang diagnostic pop
// clang-format on
}
};
struct FlatmmSn_32x128x512_1x4x1_16x16x32_FP16_itl : public FlatmmSn_32x128x512_1x4x1_16x16x32_Base
{
using BDataType = bf16_t;
using ODataType = bf16_t;
// TODO: need paired with tile_window_linear!
// TODO: need call init_raw() before call this function!
// template <typename AWindow, typename BWindow, typename OWindow, typename ScaleTensor>
template <typename BRes,
typename BCoords,
typename ORes,
typename OCoords,
typename OFlags,
typename ScaleTensor>
CK_TILE_DEVICE auto
operator()(const BRes& res_b,
const BCoords& cached_coords_b,
const ORes& res_o,
const OCoords& cached_coords_o,
const OFlags& o_flags, // this should be in sgpr
CK_TILE_LDS_ADDR void* smem,
index_t n, // loop along n dim
const ScaleTensor& scale_,
index_t tile_offset_b, // stride b is fixed to blockKr * blockW, but still can adjust
index_t tile_offset_o)
{
static_assert(BCoords::size() == 8); // 8
static_assert(OCoords::size() == 8);
const index_t tile_stride_b_bytes = tile_offset_b * sizeof(BDataType);
const index_t tile_stride_o_bytes = tile_offset_o * sizeof(ODataType);
static_assert(ScaleTensor::size() == 2);
float s0 = scale_[number<0>{}];
float s1 = scale_[number<1>{}];
// index_t loop_cnt = n / Block_N;
register float v_c0 asm("v64");
register float v_c1 asm("v65");
register float v_c2 asm("v66");
register float v_c3 asm("v67");
register float v_c4 asm("v68");
register float v_c5 asm("v69");
register float v_c6 asm("v70");
register float v_c7 asm("v71");
register float v_c8 asm("v72");
register float v_c9 asm("v73");
register float v_c10 asm("v74");
register float v_c11 asm("v75");
register float v_c12 asm("v76");
register float v_c13 asm("v77");
register float v_c14 asm("v78");
register float v_c15 asm("v79");
register float v_c16 asm("v80");
register float v_c17 asm("v81");
register float v_c18 asm("v82");
register float v_c19 asm("v83");
register float v_c20 asm("v84");
register float v_c21 asm("v85");
register float v_c22 asm("v86");
register float v_c23 asm("v87");
register float v_c24 asm("v88");
register float v_c25 asm("v89");
register float v_c26 asm("v90");
register float v_c27 asm("v91");
register float v_c28 asm("v92");
register float v_c29 asm("v93");
register float v_c30 asm("v94");
register float v_c31 asm("v95");
int32_t nan_hi = 0x7fff0000;
int32_t nan_lo = 0x00007fff;
// in smem, the layout is M0(2)*K0(128)*M1(16)*K1(4)
// every threads need 8xK in contiguous register
// ... and every wave need the same data
int lane_id = threadIdx.x % 64;
int sld_y_os = (lane_id % 16) * 4 + (lane_id / 16) * 128;
sld_y_os *= 2;
// y y p p p y
// reg before shfl M0(2)*N0(2)*Nl(4)*Nw(4)*Mw(16)*Nv(4)
// but order is N0*M0*Nv
// in LDS we need store as
// M0(2)* N0(2) * Nl(4) * Nw(4) * (Mw(16)*Nv(4) + 4)
// y y wave-id lid/16 lid%16 v
// sst(v3) = (v0/16*34 + v0%16 * 2 + wid*136) * 4
int sfl_sst = (threadIdx.x % 16 * 4) + (threadIdx.x / 16) * (64 + 4);
sfl_sst *= 2;
// from LDS we need load as
// M0(2)* N0(2) * Nl(4) * Nw(4) * (Mw(16) * Nv(4) + 4)
// ( 2 issue) (rem 32-lane) (4 wave*4issue) 2lane*1ussue(pk2)
// sld(v4) = v0/2 *34*4 + v0 % 2 *4 + wid*2 *4
int sfl_sld = (lane_id % 2) * 2 + (lane_id / 2) * (64 + 4) + (threadIdx.x / 64) * 4;
sfl_sld *= 2;
// B nr->kr
// clang-format off
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Winline-asm"
asm volatile(
#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_FP16
#include "uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16_itl.inc"
#undef CK_TILE_FLATMM_UK_MFMA
:[smem_]"+r"(smem),
[s_loop_cnt]"+s"(n),
[c0]"+v" (v_c0),
[c1]"+v" (v_c1),
[c2]"+v" (v_c2),
[c3]"+v" (v_c3),
[c4]"+v" (v_c4),
[c5]"+v" (v_c5),
[c6]"+v" (v_c6),
[c7]"+v" (v_c7),
[c8]"+v" (v_c8),
[c9]"+v" (v_c9),
[c10]"+v"(v_c10),
[c11]"+v"(v_c11),
[c12]"+v"(v_c12),
[c13]"+v"(v_c13),
[c14]"+v"(v_c14),
[c15]"+v"(v_c15),
[c16]"+v"(v_c16),
[c17]"+v"(v_c17),
[c18]"+v"(v_c18),
[c19]"+v"(v_c19),
[c20]"+v"(v_c20),
[c21]"+v"(v_c21),
[c22]"+v"(v_c22),
[c23]"+v"(v_c23),
[c24]"+v"(v_c24),
[c25]"+v"(v_c25),
[c26]"+v"(v_c26),
[c27]"+v"(v_c27),
[c28]"+v"(v_c28),
[c29]"+v"(v_c29),
[c30]"+v"(v_c30),
[c31]"+v"(v_c31)
:
[sld_a_base]"n"(0),
[shfl_base]"n"(0),
[v_sld_y_os]"v"(sld_y_os),
[v_sfl_sld]"v"(sfl_sld),
[v_sfl_sst]"v"(sfl_sst),
[s_res_o0]"s"(res_o[0]),
[s_res_o1]"s"(res_o[1]),
//[s_res_o2]"s"(res_o[2]),
//[s_res_o3]"s"(res_o[3]),
[s_res_b0]"s"(res_b[0]),
[s_res_b1]"s"(res_b[1]),
[s_res_b2]"s"(res_b[2]),
[s_res_b3]"s"(res_b[3]),
[v_os_o0]"v"(static_cast<index_t>(cached_coords_o[number<0>{}] * sizeof(ODataType))),
[v_os_o1]"v"(static_cast<index_t>(cached_coords_o[number<1>{}] * sizeof(ODataType))),
[v_os_o2]"v"(static_cast<index_t>(cached_coords_o[number<2>{}] * sizeof(ODataType))),
[v_os_o3]"v"(static_cast<index_t>(cached_coords_o[number<3>{}] * sizeof(ODataType))),
[v_os_o4]"v"(static_cast<index_t>(cached_coords_o[number<4>{}] * sizeof(ODataType))),
[v_os_o5]"v"(static_cast<index_t>(cached_coords_o[number<5>{}] * sizeof(ODataType))),
[v_os_o6]"v"(static_cast<index_t>(cached_coords_o[number<6>{}] * sizeof(ODataType))),
[v_os_o7]"v"(static_cast<index_t>(cached_coords_o[number<7>{}] * sizeof(ODataType))),
[v_os_b0]"v"(static_cast<index_t>(cached_coords_b[number<0>{}] * sizeof(BDataType))),
[v_os_b1]"v"(static_cast<index_t>(cached_coords_b[number<1>{}] * sizeof(BDataType))),
[v_os_b2]"v"(static_cast<index_t>(cached_coords_b[number<2>{}] * sizeof(BDataType))),
[v_os_b3]"v"(static_cast<index_t>(cached_coords_b[number<3>{}] * sizeof(BDataType))),
[v_os_b4]"v"(static_cast<index_t>(cached_coords_b[number<4>{}] * sizeof(BDataType))),
[v_os_b5]"v"(static_cast<index_t>(cached_coords_b[number<5>{}] * sizeof(BDataType))),
[v_os_b6]"v"(static_cast<index_t>(cached_coords_b[number<6>{}] * sizeof(BDataType))),
[v_os_b7]"v"(static_cast<index_t>(cached_coords_b[number<7>{}] * sizeof(BDataType))),
[s_tile_os_o]"s"(tile_stride_o_bytes),
[s_tile_os_b]"s"(tile_stride_b_bytes),
[scale_0]"v"(s0),
[scale_1]"v"(s1),
[v_nan_lo]"v"(nan_lo),
[v_nan_hi]"v"(nan_hi),
[s_execflag_0]"s"(o_flags[number<0>{}]),
[s_execflag_1]"s"(o_flags[number<1>{}]),
[s_execflag_2]"s"(o_flags[number<2>{}]),
[s_execflag_3]"s"(o_flags[number<3>{}]),
[s_execflag_4]"s"(o_flags[number<4>{}]),
[s_execflag_5]"s"(o_flags[number<5>{}]),
[s_execflag_6]"s"(o_flags[number<6>{}]),
[s_execflag_7]"s"(o_flags[number<7>{}])
:
"memory", "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8", "a9",
"a10", "a11", "a12", "a13", "a14", "a15", "a16", "a17", "a18", "a19",
"a20", "a21", "a22", "a23", "a24", "a25", "a26", "a27", "a28", "a29",
"a30", "a31", "a32", "a33", "a34", "a35", "a36", "a37", "a38", "a39",
"a40", "a41", "a42", "a43", "a44", "a45", "a46", "a47", "a48", "a49",
"a50", "a51", "a52", "a53", "a54", "a55", "a56", "a57", "a58", "a59",
"a60", "a61", "a62", "a63", "a64", "a65", "a66", "a67", "a68", "a69",
"a70", "a71", "a72", "a73", "a74", "a75", "a76", "a77", "a78", "a79",
"a80", "a81", "a82", "a83", "a84", "a85", "a86", "a87", "a88", "a89",
"a90", "a91", "a92", "a93", "a94", "a95", "a96", "a97", "a98", "a99",
"a100", "a101", "a102", "a103", "a104", "a105", "a106", "a107",
"a108", "a109", "a110", "a111", "a112", "a113", "a114", "a115",
"a116", "a117", "a118", "a119", "a120", "a121", "a122", "a123",
"a124", "a125", "a126", "a127", "a128", "a129", "a130", "a131",
"a132", "a133", "a134", "a135", "a136", "a137", "a138", "a139",
"a140", "a141", "a142", "a143", "a144", "a145", "a146", "a147",
"a148", "a149", "a150", "a151", "a152", "a153", "a154", "a155",
"a156", "a157", "a158", "a159", "a160", "a161", "a162", "a163",
"a164", "a165", "a166", "a167", "a168", "a169", "a170", "a171",
"a172", "a173", "a174", "a175", "a176", "a177", "a178", "a179",
"a180", "a181", "a182", "a183", "a184", "a185", "a186", "a187",
"a188", "a189", "a190", "a191", "a192", "a193", "a194", "a195",
"a196", "a197", "a198", "a199", "a200", "a201", "a202", "a203",
"a204", "a205", "a206", "a207", "a208", "a209", "a210", "a211",
"a212", "a213", "a214", "a215", "a216", "a217", "a218", "a219",
"a220", "a221", "a222", "a223", "a224", "a225", "a226", "a227",
"a228", "a229", "a230", "a231", "a232", "a233", "a234", "a235",
"a236", "a237", "a238", "a239", "a240", "a241", "a242", "a243",
"a244", "a245", "a246", "a247", "a248", "a249", "a250", "a251",
"a252", "a253", "a254", "a255",
"s8", "s9", "s12", "s13", "s14", "s15", "s38", "s39", "s52", "s86",
"s36", "s37","s59","s80",
"v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
"v50", "v54", "v55",
"v64","v65","v66","v67","v68","v69","v70","v71",
"v72","v73","v74","v75","v76","v77","v78","v79",
"v80","v81","v82","v83","v84","v85","v86","v87",
"v88","v89","v90","v91","v92","v93","v94","v95",
"v128", "v129", "v130", "v131",
"v132", "v133", "v134", "v135", "v136", "v137", "v138", "v139",
"v140", "v141", "v142", "v143", "v144", "v145", "v146", "v147",
"v148", "v149", "v150", "v151", "v152", "v153", "v154", "v155",
"v156", "v157", "v158", "v159", "v160", "v161", "v162", "v163",
"v164", "v165", "v166", "v167", "v168", "v169", "v170", "v171",
"v172", "v173", "v174", "v175", "v176", "v177", "v178", "v179",
"v180", "v181", "v182", "v183", "v184", "v185", "v186", "v187",
"v188", "v189", "v190", "v191", "v192", "v193", "v194", "v195",
"v196", "v197", "v198", "v199", "v200", "v201", "v202", "v203",
"v204", "v205", "v206", "v207", "v208", "v209", "v210", "v211",
"v212", "v213", "v214", "v215", "v216", "v217", "v218", "v219",
"v220", "v221", "v222", "v223", "v224", "v225", "v226", "v227",
"v228", "v229", "v230", "v231", "v232", "v233", "v234", "v235",
"v236", "v237", "v238", "v239", "v240", "v241", "v242", "v243",
"v244", "v245", "v246", "v247", "v248", "v249", "v250", "v251",
"v252", "v253", "v254", "v255"
);
#pragma clang diagnostic pop
// clang-format on
}
};
} // namespace ck_tile
#ifndef CK_TILE_FLATMM_UK_MFMA
#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_BF16
#endif
#if CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_BF16
#define _UK_MFMA_ "v_mfma_f32_16x16x16_bf16"
#define _UK_PK_CVT_(x0_, x1_, y_) \
" v_cmp_u_f32 s[36:37], " x0_ ", " x0_ " \n" \
" v_add3_u32 v50, " x0_ ", %[v_nan_lo], 1 \n" \
" v_cndmask_b32 v54, v50, %[v_nan_hi], s[36:37] \n" \
" v_cmp_u_f32 s[36:37], " x1_ ", " x1_ " \n" \
" v_add3_u32 v50, " x1_ ", %[v_nan_lo], 1 \n" \
" v_cndmask_b32 v55, v50, %[v_nan_hi], s[36:37] \n" \
" v_perm_b32 " y_ ", v55, v54, s52 \n"
#define _UK_ATOMIC_ADD_ "global_atomic_pk_add_bf16"
#elif CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_FP16
#define _UK_MFMA_ "v_mfma_f32_16x16x16_f16"
#define _UK_PK_CVT_(x0_, x1_, y_) \
" v_cvt_f16_f32 v54, " x0_ " \n" \
" v_cvt_f16_f32 v55, " x1_ " \n" \
" v_pack_b32_f16 " y_ ", v54, v55 \n"
#define _UK_ATOMIC_ADD_ "global_atomic_pk_add_f16"
#endif
";-------------------------------------------------------------\n"
" s_mov_b32 s52, 0x07060302 ; v_perm\n"
" s_mov_b64 s[38:39], exec ; save current exec\n"
" s_mov_b32 s8, %[s_res_o0] \n"
" s_mov_b32 s9, %[s_res_o1] \n"
" s_mov_b32 s12, %[s_res_b0] \n"
" s_mov_b32 s13, %[s_res_b1] \n"
" s_mov_b32 s14, %[s_res_b2] \n"
" s_mov_b32 s15, %[s_res_b3] \n"
" s_mov_b32 s59, 0 \n"
" ds_read_b64 v[128:129], %[v_sld_y_os] offset:0 + %[sld_a_base] \n"
" ds_read_b64 v[130:131], %[v_sld_y_os] offset:128 + %[sld_a_base] \n"
" ds_read_b64 v[132:133], %[v_sld_y_os] offset:1024 + %[sld_a_base] \n"
" ds_read_b64 v[134:135], %[v_sld_y_os] offset:1152 + %[sld_a_base] \n"
" ds_read_b64 v[136:137], %[v_sld_y_os] offset:2048 + %[sld_a_base] \n"
" ds_read_b64 v[138:139], %[v_sld_y_os] offset:2176 + %[sld_a_base] \n"
" ds_read_b64 v[140:141], %[v_sld_y_os] offset:3072 + %[sld_a_base] \n"
" ds_read_b64 v[142:143], %[v_sld_y_os] offset:3200 + %[sld_a_base] \n"
" ds_read_b64 v[144:145], %[v_sld_y_os] offset:4096 + %[sld_a_base] \n"
" ds_read_b64 v[146:147], %[v_sld_y_os] offset:4224 + %[sld_a_base] \n"
" ds_read_b64 v[148:149], %[v_sld_y_os] offset:5120 + %[sld_a_base] \n"
" ds_read_b64 v[150:151], %[v_sld_y_os] offset:5248 + %[sld_a_base] \n"
" ds_read_b64 v[152:153], %[v_sld_y_os] offset:6144 + %[sld_a_base] \n"
" ds_read_b64 v[154:155], %[v_sld_y_os] offset:6272 + %[sld_a_base] \n"
" ds_read_b64 v[156:157], %[v_sld_y_os] offset:7168 + %[sld_a_base] \n"
" ds_read_b64 v[158:159], %[v_sld_y_os] offset:7296 + %[sld_a_base] \n"
// " ds_read_b64 v[160:161], %[v_sld_y_os] offset:8192 + %[sld_a_base] \n"
// " ds_read_b64 v[162:163], %[v_sld_y_os] offset:8320 + %[sld_a_base] \n"
// " ds_read_b64 v[164:165], %[v_sld_y_os] offset:9216 + %[sld_a_base] \n"
// " ds_read_b64 v[166:167], %[v_sld_y_os] offset:9344 + %[sld_a_base] \n"
// " ds_read_b64 v[168:169], %[v_sld_y_os] offset:10240 + %[sld_a_base] \n"
// " ds_read_b64 v[170:171], %[v_sld_y_os] offset:10368 + %[sld_a_base] \n"
// " ds_read_b64 v[172:173], %[v_sld_y_os] offset:11264 + %[sld_a_base] \n"
// " ds_read_b64 v[174:175], %[v_sld_y_os] offset:11392 + %[sld_a_base] \n"
// " ds_read_b64 v[176:177], %[v_sld_y_os] offset:12288 + %[sld_a_base] \n"
// " ds_read_b64 v[178:179], %[v_sld_y_os] offset:12416 + %[sld_a_base] \n"
// " ds_read_b64 v[180:181], %[v_sld_y_os] offset:13312 + %[sld_a_base] \n"
// " ds_read_b64 v[182:183], %[v_sld_y_os] offset:13440 + %[sld_a_base] \n"
// " ds_read_b64 v[184:185], %[v_sld_y_os] offset:14336 + %[sld_a_base] \n"
// " ds_read_b64 v[186:187], %[v_sld_y_os] offset:14464 + %[sld_a_base] \n"
// " ds_read_b64 v[188:189], %[v_sld_y_os] offset:15360 + %[sld_a_base] \n"
// " ds_read_b64 v[190:191], %[v_sld_y_os] offset:15488 + %[sld_a_base] \n"
" ds_read_b64 v[192:193], %[v_sld_y_os] offset:8192 + %[sld_a_base] \n"
" ds_read_b64 v[194:195], %[v_sld_y_os] offset:8320 + %[sld_a_base] \n"
" ds_read_b64 v[196:197], %[v_sld_y_os] offset:9216 + %[sld_a_base] \n"
" ds_read_b64 v[198:199], %[v_sld_y_os] offset:9344 + %[sld_a_base] \n"
" ds_read_b64 v[200:201], %[v_sld_y_os] offset:10240 + %[sld_a_base] \n"
" ds_read_b64 v[202:203], %[v_sld_y_os] offset:10368 + %[sld_a_base] \n"
" ds_read_b64 v[204:205], %[v_sld_y_os] offset:11264 + %[sld_a_base] \n"
" ds_read_b64 v[206:207], %[v_sld_y_os] offset:11392 + %[sld_a_base] \n"
" ds_read_b64 v[208:209], %[v_sld_y_os] offset:12288 + %[sld_a_base] \n"
" ds_read_b64 v[210:211], %[v_sld_y_os] offset:12416 + %[sld_a_base] \n"
" ds_read_b64 v[212:213], %[v_sld_y_os] offset:13312 + %[sld_a_base] \n"
" ds_read_b64 v[214:215], %[v_sld_y_os] offset:13440 + %[sld_a_base] \n"
" ds_read_b64 v[216:217], %[v_sld_y_os] offset:14336 + %[sld_a_base] \n"
" ds_read_b64 v[218:219], %[v_sld_y_os] offset:14464 + %[sld_a_base] \n"
" ds_read_b64 v[220:221], %[v_sld_y_os] offset:15360 + %[sld_a_base] \n"
" ds_read_b64 v[222:223], %[v_sld_y_os] offset:15488 + %[sld_a_base] \n"
// " ds_read_b64 v[192:193], %[v_sld_y_os] offset:16384 + %[sld_a_base] \n"
// " ds_read_b64 v[194:195], %[v_sld_y_os] offset:16512 + %[sld_a_base] \n"
// " ds_read_b64 v[196:197], %[v_sld_y_os] offset:17408 + %[sld_a_base] \n"
// " ds_read_b64 v[198:199], %[v_sld_y_os] offset:17536 + %[sld_a_base] \n"
// " ds_read_b64 v[200:201], %[v_sld_y_os] offset:18432 + %[sld_a_base] \n"
// " ds_read_b64 v[202:203], %[v_sld_y_os] offset:18560 + %[sld_a_base] \n"
// " ds_read_b64 v[204:205], %[v_sld_y_os] offset:19456 + %[sld_a_base] \n"
// " ds_read_b64 v[206:207], %[v_sld_y_os] offset:19584 + %[sld_a_base] \n"
// " ds_read_b64 v[208:209], %[v_sld_y_os] offset:20480 + %[sld_a_base] \n"
// " ds_read_b64 v[210:211], %[v_sld_y_os] offset:20608 + %[sld_a_base] \n"
// " ds_read_b64 v[212:213], %[v_sld_y_os] offset:21504 + %[sld_a_base] \n"
// " ds_read_b64 v[214:215], %[v_sld_y_os] offset:21632 + %[sld_a_base] \n"
// " ds_read_b64 v[216:217], %[v_sld_y_os] offset:22528 + %[sld_a_base] \n"
// " ds_read_b64 v[218:219], %[v_sld_y_os] offset:22656 + %[sld_a_base] \n"
// " ds_read_b64 v[220:221], %[v_sld_y_os] offset:23552 + %[sld_a_base] \n"
// " ds_read_b64 v[222:223], %[v_sld_y_os] offset:23680 + %[sld_a_base] \n"
// " ds_read_b64 v[224:225], %[v_sld_y_os] offset:24576 + %[sld_a_base] \n"
// " ds_read_b64 v[226:227], %[v_sld_y_os] offset:24704 + %[sld_a_base] \n"
// " ds_read_b64 v[228:229], %[v_sld_y_os] offset:25600 + %[sld_a_base] \n"
// " ds_read_b64 v[230:231], %[v_sld_y_os] offset:25728 + %[sld_a_base] \n"
// " ds_read_b64 v[232:233], %[v_sld_y_os] offset:26624 + %[sld_a_base] \n"
// " ds_read_b64 v[234:235], %[v_sld_y_os] offset:26752 + %[sld_a_base] \n"
// " ds_read_b64 v[236:237], %[v_sld_y_os] offset:27648 + %[sld_a_base] \n"
// " ds_read_b64 v[238:239], %[v_sld_y_os] offset:27776 + %[sld_a_base] \n"
// " ds_read_b64 v[240:241], %[v_sld_y_os] offset:28672 + %[sld_a_base] \n"
// " ds_read_b64 v[242:243], %[v_sld_y_os] offset:28800 + %[sld_a_base] \n"
// " ds_read_b64 v[244:245], %[v_sld_y_os] offset:29696 + %[sld_a_base] \n"
// " ds_read_b64 v[246:247], %[v_sld_y_os] offset:29824 + %[sld_a_base] \n"
// " ds_read_b64 v[248:249], %[v_sld_y_os] offset:30720 + %[sld_a_base] \n"
// " ds_read_b64 v[250:251], %[v_sld_y_os] offset:30848 + %[sld_a_base] \n"
// " ds_read_b64 v[252:253], %[v_sld_y_os] offset:31744 + %[sld_a_base] \n"
// " ds_read_b64 v[254:255], %[v_sld_y_os] offset:31872 + %[sld_a_base] \n"
" s_waitcnt 0 \n"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen \n"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen \n"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen \n"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen \n"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072 \n"
// " buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[12:15], 0 offen \n"
// " buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[12:15], 0 offen offset:1024 \n"
// " buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[12:15], 0 offen offset:2048 \n"
// " buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[12:15], 0 offen offset:3072 \n"
// " buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[12:15], 0 offen \n"
// " buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[12:15], 0 offen offset:1024 \n"
// " buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[12:15], 0 offen offset:2048 \n"
// " buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[12:15], 0 offen offset:3072 \n"
// " buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[12:15], 0 offen \n"
// " buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[12:15], 0 offen offset:1024 \n"
// " buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[12:15], 0 offen offset:2048 \n"
// " buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[12:15], 0 offen offset:3072 \n"
// " buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[12:15], 0 offen \n"
// " buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[12:15], 0 offen offset:1024 \n"
// " buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[12:15], 0 offen offset:2048 \n"
// " buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[12:15], 0 offen offset:3072 \n"
" s_add_u32 s12, %[s_tile_os_b], s12 \n"
" s_addc_u32 s13, 0, s13 \n"
" v_mov_b32 v64, 0 \n"
" v_mov_b32 v80, 0 \n"
" v_mov_b32 v65, 0 \n"
" v_mov_b32 v81, 0 \n"
" v_mov_b32 v66, 0 \n"
" v_mov_b32 v82, 0 \n"
" v_mov_b32 v67, 0 \n"
" v_mov_b32 v83, 0 \n"
" v_mov_b32 v68, 0 \n"
" v_mov_b32 v84, 0 \n"
" v_mov_b32 v69, 0 \n"
" v_mov_b32 v85, 0 \n"
" v_mov_b32 v70, 0 \n"
" v_mov_b32 v86, 0 \n"
" v_mov_b32 v71, 0 \n"
" v_mov_b32 v87, 0 \n"
" ds_write_b64 %[v_sfl_sst], [%[c0],%[c1]] offset:16640 \n"
" ds_write_b64 %[v_sfl_sst], [%[c2],%[c3]] offset:20992 \n"
" ds_write_b64 %[v_sfl_sst], [%[c4],%[c5]] offset:18816 \n"
" ds_write_b64 %[v_sfl_sst], [%[c6],%[c7]] offset:23168 \n"
" s_mov_b32 s80, 0 \n"
" s_waitcnt vmcnt(8) \n"
"coreloop_top_%=: \n"
" s_waitcnt vmcnt(0) & lgkmcnt(0) \n"
" s_barrier \n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[0:1], v[128:129], 0 \n"
" ds_read_b32 v10, %[v_sfl_sld] offset:16640 \n"
" ds_read_b32 v11, %[v_sfl_sld] offset:16672 \n"
" ds_write_b64 %[v_sfl_sst], [%[c16],%[c17]] offset:25344 \n" _UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[2:3], v[130:131], v[64:67] \n"
" buffer_load_dwordx4 acc[128:131], %[v_os_b0], s[12:15], 0 offen \n"
" ds_write_b64 %[v_sfl_sst], [%[c18],%[c19]] offset:29696 \n" _UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[4:5], v[132:133], v[64:67] \n"
" ds_read_b32 v12, %[v_sfl_sld] offset:16704 \n"
" ds_read_b32 v13, %[v_sfl_sld] offset:16736 \n"
" ds_write_b64 %[v_sfl_sst], [%[c20],%[c21]] offset:27520 \n" _UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[6:7], v[134:135], v[64:67] \n"
" ds_write_b64 %[v_sfl_sst], [%[c22],%[c23]] offset:31872 \n" _UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[8:9], v[136:137], v[64:67] \n"
" ds_read_b32 v14, %[v_sfl_sld] offset:20992 \n"
" ds_read_b32 v15, %[v_sfl_sld] offset:21024 \n" _UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[10:11], v[138:139], v[64:67] \n"
" buffer_load_dwordx4 acc[132:135], %[v_os_b0], s[12:15], 0 offen offset:1024 \n" _UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[12:13], v[140:141], v[64:67] \n"
" ds_read_b32 v16, %[v_sfl_sld] offset:21056 \n"
" ds_read_b32 v17, %[v_sfl_sld] offset:21088 \n" _UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[14:15], v[142:143], v[64:67] \n" _UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[0:1], v[192:193], 0 \n" _UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[2:3], v[194:195], v[68:71] \n"
" buffer_load_dwordx4 acc[136:139], %[v_os_b0], s[12:15], 0 offen offset:2048 \n" _UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[4:5], v[196:197], v[68:71] \n" _UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[6:7], v[198:199], v[68:71] \n" _UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[8:9], v[200:201], v[68:71] \n" _UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[10:11], v[202:203], v[68:71] \n"
" buffer_load_dwordx4 acc[140:143], %[v_os_b0], s[12:15], 0 offen offset:3072 \n" _UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[12:13], v[204:205], v[68:71] \n" _UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[14:15], v[206:207], v[68:71] \n"
" s_waitcnt lgkmcnt(0) \n"
" s_mov_b64 exec, %[s_execflag_0] \n" _UK_ATOMIC_ADD_ " %[v_os_o0], v10, s[8:9] \n"
" s_mov_b64 exec, s[38:39] \n" _UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[16:17], v[128:129], 0 \n" _UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[18:19], v[130:131], v[72:75] \n"
" buffer_load_dwordx4 acc[144:147], %[v_os_b1], s[12:15], 0 offen \n" _UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[20:21], v[132:133], v[72:75] \n" _UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[22:23], v[134:135], v[72:75] \n" _UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[24:25], v[136:137], v[72:75] \n" _UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[26:27], v[138:139], v[72:75] \n"
" buffer_load_dwordx4 acc[148:151], %[v_os_b1], s[12:15], 0 offen offset:1024 \n" _UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[28:29], v[140:141], v[72:75] \n" _UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[30:31], v[142:143], v[72:75] \n" _UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[16:17], v[192:193], 0 \n" _UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[18:19], v[194:195], v[76:79] \n"
" buffer_load_dwordx4 acc[152:155], %[v_os_b1], s[12:15], 0 offen offset:2048 \n" _UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[20:21], v[196:197], v[76:79] \n" _UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[22:23], v[198:199], v[76:79] \n" _UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[24:25], v[200:201], v[76:79] \n" _UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[26:27], v[202:203], v[76:79] \n"
" buffer_load_dwordx4 acc[156:159], %[v_os_b1], s[12:15], 0 offen offset:3072 \n" _UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[28:29], v[204:205], v[76:79] \n" _UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[30:31], v[206:207], v[76:79] \n"
" s_mov_b64 exec, %[s_execflag_1] \n" _UK_ATOMIC_ADD_ " %[v_os_o1], v11, s[8:9] \n"
" s_mov_b64 exec, s[38:39] \n"
" s_waitcnt vmcnt(0) \n" _UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[32:33], v[144:145], v[64:67] \n" _UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[34:35], v[146:147], v[64:67] \n"
" buffer_load_dwordx4 acc[160:163], %[v_os_b2], s[12:15], 0 offen \n" _UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[36:37], v[148:149], v[64:67] \n" _UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[38:39], v[150:151], v[64:67] \n" _UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[40:41], v[152:153], v[64:67] \n" _UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[42:43], v[154:155], v[64:67] \n"
" buffer_load_dwordx4 acc[164:167], %[v_os_b2], s[12:15], 0 offen offset:1024 \n" _UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[44:45], v[156:157], v[64:67] \n" _UK_MFMA_
" [%[c0], %[c1], %[c2], %[c3]], acc[46:47], v[158:159], v[64:67] \n" _UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[32:33], v[208:209], v[68:71] \n" _UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[34:35], v[210:211], v[68:71] \n"
" buffer_load_dwordx4 acc[168:171], %[v_os_b2], s[12:15], 0 offen offset:2048 \n" _UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[36:37], v[212:213], v[68:71] \n" _UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[38:39], v[214:215], v[68:71] \n" _UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[40:41], v[216:217], v[68:71] \n" _UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[42:43], v[218:219], v[68:71] \n"
" buffer_load_dwordx4 acc[172:175], %[v_os_b2], s[12:15], 0 offen offset:3072 \n" _UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[44:45], v[220:221], v[68:71] \n" _UK_MFMA_
" [%[c4], %[c5], %[c6], %[c7]], acc[46:47], v[222:223], v[68:71] \n"
" s_mov_b64 exec, %[s_execflag_2] \n" _UK_ATOMIC_ADD_ " %[v_os_o2], v12, s[8:9] \n"
" s_mov_b64 exec, s[38:39] \n" _UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[48:49], v[144:145], v[72:75] \n" _UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[50:51], v[146:147], v[72:75] \n"
" buffer_load_dwordx4 acc[176:179], %[v_os_b3], s[12:15], 0 offen \n" _UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[52:53], v[148:149], v[72:75] \n" _UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[54:55], v[150:151], v[72:75] \n" _UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[56:57], v[152:153], v[72:75] \n" _UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[58:59], v[154:155], v[72:75] \n"
" buffer_load_dwordx4 acc[180:183], %[v_os_b3], s[12:15], 0 offen offset:1024 \n" _UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[60:61], v[156:157], v[72:75] \n" _UK_MFMA_
" [%[c8], %[c9], %[c10], %[c11]], acc[62:63], v[158:159], v[72:75] \n" _UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[48:49], v[208:209], v[76:79] \n" _UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[50:51], v[210:211], v[76:79] \n"
" buffer_load_dwordx4 acc[184:187], %[v_os_b3], s[12:15], 0 offen offset:2048 \n" _UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[52:53], v[212:213], v[76:79] \n" _UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[54:55], v[214:215], v[76:79] \n" _UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[56:57], v[216:217], v[76:79] \n" _UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[58:59], v[218:219], v[76:79] \n"
" buffer_load_dwordx4 acc[188:191], %[v_os_b3], s[12:15], 0 offen offset:3072 \n" _UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[60:61], v[220:221], v[76:79] \n" _UK_MFMA_
" [%[c12], %[c13], %[c14], %[c15]], acc[62:63], v[222:223], v[76:79] \n"
" s_mov_b64 exec, %[s_execflag_3] \n" _UK_ATOMIC_ADD_ " %[v_os_o3], v13, s[8:9] \n"
" s_mov_b64 exec, s[38:39] \n"
" s_waitcnt vmcnt(0) \n"
// _UK_MFMA_
// " [%[c0], %[c1], %[c2], %[c3]], acc[64:65], v[160:161], v[64:67] \n" _UK_MFMA_
// " [%[c0], %[c1], %[c2], %[c3]], acc[66:67], v[162:163], v[64:67] \n"
// " buffer_load_dwordx4 acc[192:195], %[v_os_b4], s[12:15], 0 offen \n" _UK_MFMA_
// " [%[c0], %[c1], %[c2], %[c3]], acc[68:69], v[164:165], v[64:67] \n" _UK_MFMA_
// " [%[c0], %[c1], %[c2], %[c3]], acc[70:71], v[166:167], v[64:67] \n" _UK_MFMA_
// " [%[c0], %[c1], %[c2], %[c3]], acc[72:73], v[168:169], v[64:67] \n" _UK_MFMA_
// " [%[c0], %[c1], %[c2], %[c3]], acc[74:75], v[170:171], v[64:67] \n"
// " buffer_load_dwordx4 acc[196:199], %[v_os_b4], s[12:15], 0 offen offset:1024 \n" _UK_MFMA_
// " [%[c0], %[c1], %[c2], %[c3]], acc[76:77], v[172:173], v[64:67] \n" _UK_MFMA_
// " [%[c0], %[c1], %[c2], %[c3]], acc[78:79], v[174:175], v[64:67] \n" _UK_MFMA_
// " [%[c4], %[c5], %[c6], %[c7]], acc[64:65], v[224:225], v[68:71] \n" _UK_MFMA_
// " [%[c4], %[c5], %[c6], %[c7]], acc[66:67], v[226:227], v[68:71] \n"
// " buffer_load_dwordx4 acc[200:203], %[v_os_b4], s[12:15], 0 offen offset:2048 \n" _UK_MFMA_
// " [%[c4], %[c5], %[c6], %[c7]], acc[68:69], v[228:229], v[68:71] \n" _UK_MFMA_
// " [%[c4], %[c5], %[c6], %[c7]], acc[70:71], v[230:231], v[68:71] \n" _UK_MFMA_
// " [%[c4], %[c5], %[c6], %[c7]], acc[72:73], v[232:233], v[68:71] \n" _UK_MFMA_
// " [%[c4], %[c5], %[c6], %[c7]], acc[74:75], v[234:235], v[68:71] \n"
// " buffer_load_dwordx4 acc[204:207], %[v_os_b4], s[12:15], 0 offen offset:3072 \n" _UK_MFMA_
// " [%[c4], %[c5], %[c6], %[c7]], acc[76:77], v[236:237], v[68:71] \n" _UK_MFMA_
// " [%[c4], %[c5], %[c6], %[c7]], acc[78:79], v[238:239], v[68:71] \n"
" s_mov_b64 exec, %[s_execflag_4] \n" _UK_ATOMIC_ADD_ " %[v_os_o4], v14, s[8:9] \n"
" s_mov_b64 exec, s[38:39] \n"
// _UK_MFMA_
// " [%[c8], %[c9], %[c10], %[c11]], acc[80:81], v[160:161], v[72:75] \n" _UK_MFMA_
// " [%[c8], %[c9], %[c10], %[c11]], acc[82:83], v[162:163], v[72:75] \n"
// " buffer_load_dwordx4 acc[208:211], %[v_os_b5], s[12:15], 0 offen \n" _UK_MFMA_
// " [%[c8], %[c9], %[c10], %[c11]], acc[84:85], v[164:165], v[72:75] \n" _UK_MFMA_
// " [%[c8], %[c9], %[c10], %[c11]], acc[86:87], v[166:167], v[72:75] \n" _UK_MFMA_
// " [%[c8], %[c9], %[c10], %[c11]], acc[88:89], v[168:169], v[72:75] \n" _UK_MFMA_
// " [%[c8], %[c9], %[c10], %[c11]], acc[90:91], v[170:171], v[72:75] \n"
// " buffer_load_dwordx4 acc[212:215], %[v_os_b5], s[12:15], 0 offen offset:1024 \n" _UK_MFMA_
// " [%[c8], %[c9], %[c10], %[c11]], acc[92:93], v[172:173], v[72:75] \n" _UK_MFMA_
// " [%[c8], %[c9], %[c10], %[c11]], acc[94:95], v[174:175], v[72:75] \n" _UK_MFMA_
// " [%[c12], %[c13], %[c14], %[c15]], acc[80:81], v[224:225], v[76:79] \n" _UK_MFMA_
// " [%[c12], %[c13], %[c14], %[c15]], acc[82:83], v[226:227], v[76:79] \n"
// " buffer_load_dwordx4 acc[216:219], %[v_os_b5], s[12:15], 0 offen offset:2048 \n" _UK_MFMA_
// " [%[c12], %[c13], %[c14], %[c15]], acc[84:85], v[228:229], v[76:79] \n" _UK_MFMA_
// " [%[c12], %[c13], %[c14], %[c15]], acc[86:87], v[230:231], v[76:79] \n" _UK_MFMA_
// " [%[c12], %[c13], %[c14], %[c15]], acc[88:89], v[232:233], v[76:79] \n" _UK_MFMA_
// " [%[c12], %[c13], %[c14], %[c15]], acc[90:91], v[234:235], v[76:79] \n"
// " buffer_load_dwordx4 acc[220:223], %[v_os_b5], s[12:15], 0 offen offset:3072 \n" _UK_MFMA_
// " [%[c12], %[c13], %[c14], %[c15]], acc[92:93], v[236:237], v[76:79] \n" _UK_MFMA_
// " [%[c12], %[c13], %[c14], %[c15]], acc[94:95], v[238:239], v[76:79] \n"
" s_mov_b64 exec, %[s_execflag_5] \n" _UK_ATOMIC_ADD_ " %[v_os_o5], v15, s[8:9] \n"
" s_mov_b64 exec, s[38:39] \n"
" s_waitcnt vmcnt(0) \n"
// _UK_MFMA_
// " [%[c0], %[c1], %[c2], %[c3]], acc[96:97], v[176:177], v[64:67] \n" _UK_MFMA_
// " [%[c0], %[c1], %[c2], %[c3]], acc[98:99], v[178:179], v[64:67] \n"
// " buffer_load_dwordx4 acc[224:227], %[v_os_b6], s[12:15], 0 offen \n" _UK_MFMA_
// " [%[c0], %[c1], %[c2], %[c3]], acc[100:101], v[180:181], v[64:67] \n" _UK_MFMA_
// " [%[c0], %[c1], %[c2], %[c3]], acc[102:103], v[182:183], v[64:67] \n" _UK_MFMA_
// " [%[c0], %[c1], %[c2], %[c3]], acc[104:105], v[184:185], v[64:67] \n" _UK_MFMA_
// " [%[c0], %[c1], %[c2], %[c3]], acc[106:107], v[186:187], v[64:67] \n"
// " buffer_load_dwordx4 acc[228:231], %[v_os_b6], s[12:15], 0 offen offset:1024 \n" _UK_MFMA_
// " [%[c0], %[c1], %[c2], %[c3]], acc[108:109], v[188:189], v[64:67] \n" _UK_MFMA_
// " [%[c0], %[c1], %[c2], %[c3]], acc[110:111], v[190:191], v[64:67] \n" _UK_MFMA_
// " [%[c4], %[c5], %[c6], %[c7]], acc[96:97], v[240:241], v[68:71] \n" _UK_MFMA_
// " [%[c4], %[c5], %[c6], %[c7]], acc[98:99], v[242:243], v[68:71] \n"
// " buffer_load_dwordx4 acc[232:235], %[v_os_b6], s[12:15], 0 offen offset:2048 \n" _UK_MFMA_
// " [%[c4], %[c5], %[c6], %[c7]], acc[100:101], v[244:245], v[68:71] \n" _UK_MFMA_
// " [%[c4], %[c5], %[c6], %[c7]], acc[102:103], v[246:247], v[68:71] \n" _UK_MFMA_
// " [%[c4], %[c5], %[c6], %[c7]], acc[104:105], v[248:249], v[68:71] \n" _UK_MFMA_
// " [%[c4], %[c5], %[c6], %[c7]], acc[106:107], v[250:251], v[68:71] \n"
// " buffer_load_dwordx4 acc[236:239], %[v_os_b6], s[12:15], 0 offen offset:3072 \n" _UK_MFMA_
// " [%[c4], %[c5], %[c6], %[c7]], acc[108:109], v[252:253], v[68:71] \n" _UK_MFMA_
// " [%[c4], %[c5], %[c6], %[c7]], acc[110:111], v[254:255], v[68:71] \n"
" s_mov_b64 exec, %[s_execflag_6] \n" _UK_ATOMIC_ADD_ " %[v_os_o6], v16, s[8:9] \n"
" s_mov_b64 exec, s[38:39] \n"
// _UK_MFMA_
// " [%[c8], %[c9], %[c10], %[c11]], acc[112:113], v[176:177], v[72:75] \n" _UK_MFMA_
// " [%[c8], %[c9], %[c10], %[c11]], acc[114:115], v[178:179], v[72:75] \n"
// " buffer_load_dwordx4 acc[240:243], %[v_os_b7], s[12:15], 0 offen \n" _UK_MFMA_
// " [%[c8], %[c9], %[c10], %[c11]], acc[116:117], v[180:181], v[72:75] \n" _UK_MFMA_
// " [%[c8], %[c9], %[c10], %[c11]], acc[118:119], v[182:183], v[72:75] \n" _UK_MFMA_
// " [%[c8], %[c9], %[c10], %[c11]], acc[120:121], v[184:185], v[72:75] \n" _UK_MFMA_
// " [%[c8], %[c9], %[c10], %[c11]], acc[122:123], v[186:187], v[72:75] \n"
// " buffer_load_dwordx4 acc[244:247], %[v_os_b7], s[12:15], 0 offen offset:1024 \n" _UK_MFMA_
// " [%[c8], %[c9], %[c10], %[c11]], acc[124:125], v[188:189], v[72:75] \n" _UK_MFMA_
// " [%[c8], %[c9], %[c10], %[c11]], acc[126:127], v[190:191], v[72:75] \n" _UK_MFMA_
// " [%[c12], %[c13], %[c14], %[c15]], acc[112:113], v[240:241], v[76:79] \n" _UK_MFMA_
// " [%[c12], %[c13], %[c14], %[c15]], acc[114:115], v[242:243], v[76:79] \n"
// " buffer_load_dwordx4 acc[248:251], %[v_os_b7], s[12:15], 0 offen offset:2048 \n" _UK_MFMA_
// " [%[c12], %[c13], %[c14], %[c15]], acc[116:117], v[244:245], v[76:79] \n" _UK_MFMA_
// " [%[c12], %[c13], %[c14], %[c15]], acc[118:119], v[246:247], v[76:79] \n" _UK_MFMA_
// " [%[c12], %[c13], %[c14], %[c15]], acc[120:121], v[248:249], v[76:79] \n" _UK_MFMA_
// " [%[c12], %[c13], %[c14], %[c15]], acc[122:123], v[250:251], v[76:79] \n"
// " buffer_load_dwordx4 acc[252:255], %[v_os_b7], s[12:15], 0 offen offset:3072 \n" _UK_MFMA_
// " [%[c12], %[c13], %[c14], %[c15]], acc[124:125], v[252:253], v[76:79] \n" _UK_MFMA_
// " [%[c12], %[c13], %[c14], %[c15]], acc[126:127], v[254:255], v[76:79] \n"
" s_mov_b64 exec, %[s_execflag_7] \n" _UK_ATOMIC_ADD_ " %[v_os_o7], v17, s[8:9] \n"
" s_mov_b64 exec, s[38:39] \n"
" s_add_u32 s60, 0x00000100, s80 \n"
" s_cmp_lt_u32 s60, %[s_loop_cnt] \n"
" s_cselect_b32 s56, %[s_tile_os_b], 0 \n"
" s_add_u32 s12, s56, s12 \n"
" s_addc_u32 s13, 0, s13 \n"
" s_cmp_ge_u32 s80, 0x00000100 \n"
" s_cselect_b32 s59, %[s_tile_os_o], s59 \n"
" s_add_u32 s8, s59, s8 \n"
" s_addc_u32 s9, 0, s9 \n"
" v_mul_f32 %[c0], %[scale_0], %[c0] \n"
" v_mul_f32 %[c1], %[scale_0], %[c1] \n"
" v_mul_f32 %[c2], %[scale_0], %[c2] \n"
" v_mul_f32 %[c3], %[scale_0], %[c3] \n"
" v_mul_f32 %[c4], %[scale_1], %[c4] \n"
" v_mul_f32 %[c5], %[scale_1], %[c5] \n"
" v_mul_f32 %[c6], %[scale_1], %[c6] \n"
" v_mul_f32 %[c7], %[scale_1], %[c7] \n"
" v_mul_f32 %[c8], %[scale_0], %[c8] \n"
" v_mul_f32 %[c9], %[scale_0], %[c9] \n"
" v_mul_f32 %[c10], %[scale_0], %[c10] \n"
" v_mul_f32 %[c11], %[scale_0], %[c11] \n"
" v_mul_f32 %[c12], %[scale_1], %[c12] \n"
" v_mul_f32 %[c13], %[scale_1], %[c13] \n"
" v_mul_f32 %[c14], %[scale_1], %[c14] \n"
" v_mul_f32 %[c15], %[scale_1], %[c15] \n" _UK_PK_CVT_("%[c0]", "%[c1]", "%[c0]") _UK_PK_CVT_(
"%[c2]",
"%[c3]",
"%[c1]") _UK_PK_CVT_("%[c4]",
"%[c5]",
"%[c2]") _UK_PK_CVT_("%[c6]",
"%[c7]",
"%[c3]") _UK_PK_CVT_("%[c8]",
"%[c9]",
"%[c4]") _UK_PK_CVT_("%["
"c10]",
"%["
"c11]",
"%[c5]")
_UK_PK_CVT_("%[c12]", "%[c13]", "%[c6]") _UK_PK_CVT_(
"%[c14]",
"%[c15]",
"%[c7]") " s_addk_i32 s80, 0x0080 \n"
" s_cmp_lt_i32 s80, %[s_loop_cnt] \n"
" s_cbranch_scc0 loop_atomic_%= \n"
" s_waitcnt vmcnt(0) & lgkmcnt(0) \n"
" s_barrier \n" _UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[128:129], v[128:129], 0 \n"
" ds_read_b32 v10, %[v_sfl_sld] offset:25344 \n"
" ds_read_b32 v11, %[v_sfl_sld] offset:25376 \n"
" ds_write_b64 v3, v[64:65] offset:16640 \n" _UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[130:131], v[130:131], v[80:83] \n"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen \n"
" ds_write_b64 v3, v[66:67] offset:20992 \n" _UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[132:133], v[132:133], v[80:83] \n"
" ds_read_b32 v12, %[v_sfl_sld] offset:25408 \n"
" ds_read_b32 v13, %[v_sfl_sld] offset:25440 \n"
" ds_write_b64 v3, v[68:69] offset:18816 \n" _UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[134:135], v[134:135], v[80:83] \n"
" ds_write_b64 v3, v[70:71] offset:23168 \n" _UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[136:137], v[136:137], v[80:83] \n"
" ds_read_b32 v14, %[v_sfl_sld] offset:29696 \n"
" ds_read_b32 v15, %[v_sfl_sld] offset:29728 \n" _UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[138:139], v[138:139], v[80:83] \n"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024 "
"\n" _UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[140:141], v[140:141], v[80:83] \n"
" ds_read_b32 v16, %[v_sfl_sld] offset:29760 \n"
" ds_read_b32 v17, %[v_sfl_sld] offset:29792 \n" _UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[142:143], v[142:143], v[80:83] "
"\n" _UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[128:129], v[192:193], 0 \n" _UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[130:131], v[194:195], v[84:87] \n"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048 "
"\n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[132:133], v[196:197], "
"v[84:87] \n" _UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[134:135], v[198:199], v[84:87] "
"\n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[136:137], v[200:201], "
"v[84:87] \n" _UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[138:139], v[202:203], v[84:87] \n"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072 "
"\n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[140:141], v[204:205], "
"v[84:87] \n" _UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[142:143], v[206:207], v[84:87] \n"
" s_waitcnt lgkmcnt(0) \n"
" s_mov_b64 exec, %[s_execflag_0] \n" _UK_ATOMIC_ADD_
" %[v_os_o0], v10, s[8:9] \n"
" s_mov_b64 exec, s[38:39] \n" _UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[144:145], v[128:129], 0 \n" _UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[146:147], v[130:131], v[88:91] \n"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen \n" _UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[148:149], v[132:133], v[88:91] "
"\n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[150:151], v[134:135], "
"v[88:91] \n" _UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[152:153], v[136:137], v[88:91] "
"\n" _UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[154:155], v[138:139], v[88:91] \n"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024 "
"\n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[156:157], v[140:141], "
"v[88:91] \n" _UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[158:159], v[142:143], v[88:91] "
"\n" _UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[144:145], v[192:193], 0 \n" _UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[146:147], v[194:195], v[92:95] \n"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048 "
"\n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[148:149], v[196:197], "
"v[92:95] \n" _UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[150:151], v[198:199], v[92:95] "
"\n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[152:153], v[200:201], "
"v[92:95] \n" _UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[154:155], v[202:203], v[92:95] \n"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072 "
"\n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[156:157], v[204:205], "
"v[92:95] \n" _UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[158:159], v[206:207], v[92:95] \n"
" s_mov_b64 exec, %[s_execflag_1] \n" _UK_ATOMIC_ADD_
" %[v_os_o1], v11, s[8:9] \n"
" s_mov_b64 exec, s[38:39] \n"
" s_waitcnt vmcnt(0) \n" _UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[160:161], v[144:145], v[80:83] "
"\n" _UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[162:163], v[146:147], v[80:83] \n"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen \n" _UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[164:165], v[148:149], v[80:83] "
"\n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[166:167], v[150:151], "
"v[80:83] \n" _UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[168:169], v[152:153], v[80:83] "
"\n" _UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[170:171], v[154:155], v[80:83] \n"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024 "
"\n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[172:173], v[156:157], "
"v[80:83] \n" _UK_MFMA_
" [%[c16], %[c17], %[c18], %[c19]], acc[174:175], v[158:159], v[80:83] "
"\n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[160:161], v[208:209], "
"v[84:87] \n" _UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[162:163], v[210:211], v[84:87] \n"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048 "
"\n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[164:165], v[212:213], "
"v[84:87] \n" _UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[166:167], v[214:215], v[84:87] "
"\n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[168:169], v[216:217], "
"v[84:87] \n" _UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[170:171], v[218:219], v[84:87] \n"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072 "
"\n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[172:173], v[220:221], "
"v[84:87] \n" _UK_MFMA_
" [%[c20], %[c21], %[c22], %[c23]], acc[174:175], v[222:223], v[84:87] \n"
" s_mov_b64 exec, %[s_execflag_2] \n" _UK_ATOMIC_ADD_
" %[v_os_o2], v12, s[8:9] \n"
" s_mov_b64 exec, s[38:39] \n" _UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[176:177], v[144:145], v[88:91] "
"\n" _UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[178:179], v[146:147], v[88:91] \n"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen \n" _UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[180:181], v[148:149], v[88:91] "
"\n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[182:183], v[150:151], "
"v[88:91] \n" _UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[184:185], v[152:153], v[88:91] "
"\n" _UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[186:187], v[154:155], v[88:91] \n"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024 "
"\n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[188:189], v[156:157], "
"v[88:91] \n" _UK_MFMA_
" [%[c24], %[c25], %[c26], %[c27]], acc[190:191], v[158:159], v[88:91] "
"\n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[176:177], v[208:209], "
"v[92:95] \n" _UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[178:179], v[210:211], v[92:95] \n"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048 "
"\n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[180:181], v[212:213], "
"v[92:95] \n" _UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[182:183], v[214:215], v[92:95] "
"\n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[184:185], v[216:217], "
"v[92:95] \n" _UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[186:187], v[218:219], v[92:95] \n"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072 "
"\n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[188:189], v[220:221], "
"v[92:95] \n" _UK_MFMA_
" [%[c28], %[c29], %[c30], %[c31]], acc[190:191], v[222:223], v[92:95] \n"
" s_mov_b64 exec, %[s_execflag_3] \n" _UK_ATOMIC_ADD_
" %[v_os_o3], v13, s[8:9] \n"
" s_mov_b64 exec, s[38:39] \n"
" s_waitcnt vmcnt(0) \n"
// _UK_MFMA_
// " [%[c16], %[c17], %[c18], %[c19]], acc[192:193], v[160:161], v[80:83] "
// "\n" _UK_MFMA_
// " [%[c16], %[c17], %[c18], %[c19]], acc[194:195], v[162:163], v[80:83] \n"
// " buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[12:15], 0 offen \n" _UK_MFMA_
// " [%[c16], %[c17], %[c18], %[c19]], acc[196:197], v[164:165], v[80:83] "
// "\n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[198:199], v[166:167], "
// "v[80:83] \n" _UK_MFMA_
// " [%[c16], %[c17], %[c18], %[c19]], acc[200:201], v[168:169], v[80:83] "
// "\n" _UK_MFMA_
// " [%[c16], %[c17], %[c18], %[c19]], acc[202:203], v[170:171], v[80:83] \n"
// " buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[12:15], 0 offen offset:1024 "
// "\n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[204:205], v[172:173], "
// "v[80:83] \n" _UK_MFMA_
// " [%[c16], %[c17], %[c18], %[c19]], acc[206:207], v[174:175], v[80:83] "
// "\n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[192:193], v[224:225], "
// "v[84:87] \n" _UK_MFMA_
// " [%[c20], %[c21], %[c22], %[c23]], acc[194:195], v[226:227], v[84:87] \n"
// " buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[12:15], 0 offen offset:2048 "
// "\n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[196:197], v[228:229], "
// "v[84:87] \n" _UK_MFMA_
// " [%[c20], %[c21], %[c22], %[c23]], acc[198:199], v[230:231], v[84:87] "
// "\n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[200:201], v[232:233], "
// "v[84:87] \n" _UK_MFMA_
// " [%[c20], %[c21], %[c22], %[c23]], acc[202:203], v[234:235], v[84:87] \n"
// " buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[12:15], 0 offen offset:3072 "
// "\n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[204:205], v[236:237], "
// "v[84:87] \n" _UK_MFMA_
// " [%[c20], %[c21], %[c22], %[c23]], acc[206:207], v[238:239], v[84:87] \n"
" s_mov_b64 exec, %[s_execflag_4] \n" _UK_ATOMIC_ADD_
" %[v_os_o4], v14, s[8:9] \n"
" s_mov_b64 exec, s[38:39] \n"
// _UK_MFMA_
// " [%[c24], %[c25], %[c26], %[c27]], acc[208:209], v[160:161], v[88:91] "
// "\n" _UK_MFMA_
// " [%[c24], %[c25], %[c26], %[c27]], acc[210:211], v[162:163], v[88:91] \n"
// " buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[12:15], 0 offen \n" _UK_MFMA_
// " [%[c24], %[c25], %[c26], %[c27]], acc[212:213], v[164:165], v[88:91] "
// "\n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[214:215], v[166:167], "
// "v[88:91] \n" _UK_MFMA_
// " [%[c24], %[c25], %[c26], %[c27]], acc[216:217], v[168:169], v[88:91] "
// "\n" _UK_MFMA_
// " [%[c24], %[c25], %[c26], %[c27]], acc[218:219], v[170:171], v[88:91] \n"
// " buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[12:15], 0 offen offset:1024 "
// "\n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[220:221], v[172:173], "
// "v[88:91] \n" _UK_MFMA_
// " [%[c24], %[c25], %[c26], %[c27]], acc[222:223], v[174:175], v[88:91] "
// "\n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[208:209], v[224:225], "
// "v[92:95] \n" _UK_MFMA_
// " [%[c28], %[c29], %[c30], %[c31]], acc[210:211], v[226:227], v[92:95] \n"
// " buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[12:15], 0 offen offset:2048 "
// "\n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[212:213], v[228:229], "
// "v[92:95] \n" _UK_MFMA_
// " [%[c28], %[c29], %[c30], %[c31]], acc[214:215], v[230:231], v[92:95] "
// "\n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[216:217], v[232:233], "
// "v[92:95] \n" _UK_MFMA_
// " [%[c28], %[c29], %[c30], %[c31]], acc[218:219], v[234:235], v[92:95] \n"
// " buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[12:15], 0 offen offset:3072 "
// "\n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[220:221], v[236:237], "
// "v[92:95] \n" _UK_MFMA_
// " [%[c28], %[c29], %[c30], %[c31]], acc[222:223], v[238:239], v[92:95] \n"
" s_mov_b64 exec, %[s_execflag_5] \n" _UK_ATOMIC_ADD_
" %[v_os_o5], v15, s[8:9] \n"
" s_mov_b64 exec, s[38:39] \n"
" s_waitcnt vmcnt(0) \n"
// _UK_MFMA_
// " [%[c16], %[c17], %[c18], %[c19]], acc[224:225], v[176:177], v[80:83] "
// "\n" _UK_MFMA_
// " [%[c16], %[c17], %[c18], %[c19]], acc[226:227], v[178:179], v[80:83] \n"
// " buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[12:15], 0 offen \n" _UK_MFMA_
// " [%[c16], %[c17], %[c18], %[c19]], acc[228:229], v[180:181], v[80:83] "
// "\n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[230:231], v[182:183], "
// "v[80:83] \n" _UK_MFMA_
// " [%[c16], %[c17], %[c18], %[c19]], acc[232:233], v[184:185], v[80:83] "
// "\n" _UK_MFMA_
// " [%[c16], %[c17], %[c18], %[c19]], acc[234:235], v[186:187], v[80:83] \n"
// " buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[12:15], 0 offen "
// "offset:1024 \n" _UK_MFMA_ " [%[c16], %[c17], %[c18], %[c19]], acc[236:237], "
// "v[188:189], v[80:83] \n" _UK_MFMA_
// " [%[c16], %[c17], %[c18], %[c19]], acc[238:239], v[190:191], v[80:83] "
// "\n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[224:225], v[240:241], "
// "v[84:87] \n" _UK_MFMA_
// " [%[c20], %[c21], %[c22], %[c23]], acc[226:227], v[242:243], v[84:87] \n"
// " buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[12:15], 0 offen "
// "offset:2048 \n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[228:229], "
// "v[244:245], v[84:87] \n" _UK_MFMA_
// " [%[c20], %[c21], %[c22], %[c23]], acc[230:231], v[246:247], v[84:87] "
// "\n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[232:233], v[248:249], "
// "v[84:87] \n" _UK_MFMA_
// " [%[c20], %[c21], %[c22], %[c23]], acc[234:235], v[250:251], v[84:87] \n"
// " buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[12:15], 0 offen "
// "offset:3072 \n" _UK_MFMA_ " [%[c20], %[c21], %[c22], %[c23]], acc[236:237], "
// "v[252:253], v[84:87] \n" _UK_MFMA_
// " [%[c20], %[c21], %[c22], %[c23]], acc[238:239], v[254:255], v[84:87] \n"
" s_mov_b64 exec, %[s_execflag_6] \n" _UK_ATOMIC_ADD_
" %[v_os_o6], v16, s[8:9] \n"
" s_mov_b64 exec, s[38:39] \n"
// _UK_MFMA_
// " [%[c24], %[c25], %[c26], %[c27]], acc[240:241], v[176:177], v[88:91] "
// "\n" _UK_MFMA_
// " [%[c24], %[c25], %[c26], %[c27]], acc[242:243], v[178:179], v[88:91] \n"
// " buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[12:15], 0 offen "
// "\n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[244:245], v[180:181], "
// "v[88:91] \n" _UK_MFMA_
// " [%[c24], %[c25], %[c26], %[c27]], acc[246:247], v[182:183], v[88:91] "
// "\n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[248:249], v[184:185], "
// "v[88:91] \n" _UK_MFMA_
// " [%[c24], %[c25], %[c26], %[c27]], acc[250:251], v[186:187], v[88:91] \n"
// " buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[12:15], 0 offen "
// "offset:1024 \n" _UK_MFMA_ " [%[c24], %[c25], %[c26], %[c27]], acc[252:253], "
// "v[188:189], v[88:91] \n" _UK_MFMA_
// " [%[c24], %[c25], %[c26], %[c27]], acc[254:255], v[190:191], v[88:91] "
// "\n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[240:241], v[240:241], "
// "v[92:95] \n" _UK_MFMA_
// " [%[c28], %[c29], %[c30], %[c31]], acc[242:243], v[242:243], v[92:95] \n"
// " buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[12:15], 0 offen "
// "offset:2048 \n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[244:245], "
// "v[244:245], v[92:95] \n" _UK_MFMA_
// " [%[c28], %[c29], %[c30], %[c31]], acc[246:247], v[246:247], v[92:95] "
// "\n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[248:249], v[248:249], "
// "v[92:95] \n" _UK_MFMA_
// " [%[c28], %[c29], %[c30], %[c31]], acc[250:251], v[250:251], v[92:95] \n"
// " buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[12:15], 0 offen "
// "offset:3072 \n" _UK_MFMA_ " [%[c28], %[c29], %[c30], %[c31]], acc[252:253], "
// "v[252:253], v[92:95] \n" _UK_MFMA_
// " [%[c28], %[c29], %[c30], %[c31]], acc[254:255], v[254:255], v[92:95] \n"
" s_mov_b64 exec, %[s_execflag_7] \n" _UK_ATOMIC_ADD_
" %[v_os_o7], v17, s[8:9] \n"
" s_mov_b64 exec, s[38:39] \n"
" s_add_u32 s60, 0x00000100, s80 \n"
" s_cmp_lt_u32 s60, %[s_loop_cnt] \n"
" s_cselect_b32 s56, s56, 0 \n"
" s_add_u32 s12, s56, s12 \n"
" s_addc_u32 s13, 0, s13 \n"
" s_cmp_ge_u32 s80, 0x00000100 \n"
" s_cselect_b32 s59, 0x00000100, s59 \n"
" s_add_u32 s8, s59, s8 \n"
" s_addc_u32 s9, 0, s9 \n"
" v_mul_f32 %[c16], %[scale_0], %[c16] \n"
" v_mul_f32 %[c17], %[scale_0], %[c17] \n"
" v_mul_f32 %[c18], %[scale_0], %[c18] \n"
" v_mul_f32 %[c19], %[scale_0], %[c19] \n"
" v_mul_f32 %[c20], %[scale_1], %[c20] \n"
" v_mul_f32 %[c21], %[scale_1], %[c21] \n"
" v_mul_f32 %[c22], %[scale_1], %[c22] \n"
" v_mul_f32 %[c23], %[scale_1], %[c23] \n"
" v_mul_f32 %[c24], %[scale_0], %[c24] \n"
" v_mul_f32 %[c25], %[scale_0], %[c25] \n"
" v_mul_f32 %[c26], %[scale_0], %[c26] \n"
" v_mul_f32 %[c27], %[scale_0], %[c27] \n"
" v_mul_f32 %[c28], %[scale_1], %[c28] \n"
" v_mul_f32 %[c29], %[scale_1], %[c29] \n"
" v_mul_f32 %[c30], %[scale_1], %[c30] \n"
" v_mul_f32 %[c31], %[scale_1], %[c31] \n" _UK_PK_CVT_(
"%[c16]", "%[c17]", "%[c16]") _UK_PK_CVT_("%[c18]", "%[c19]", "%[c17]")
_UK_PK_CVT_("%[c20]", "%[c21]", "%[c18]") _UK_PK_CVT_(
"%[c22]", "%[c23]", "%[c19]") _UK_PK_CVT_("%[c24]", "%[c25]", "%[c20]")
_UK_PK_CVT_("%[c26]", "%[c27]", "%[c21]")
_UK_PK_CVT_("%[c28]", "%[c29]", "%[c22]") _UK_PK_CVT_(
"%[c30]",
"%[c31]",
"%[c23]") " s_addk_i32 s80, 0x0080 \n"
" s_cmp_lt_i32 s80, %[s_loop_cnt] \n"
" s_cbranch_scc0 loop_atomic_%= \n"
" s_branch coreloop_top_%= \n"
" loop_atomic_%=: \n"
" s_waitcnt lgkmcnt(0) \n"
" s_barrier \n"
" ds_read_b32 v10, %[v_sfl_sld] offset:16640 \n"
" ds_read_b32 v11, %[v_sfl_sld] offset:16672 \n"
" ds_read_b32 v12, %[v_sfl_sld] offset:16704 \n"
" ds_read_b32 v13, %[v_sfl_sld] offset:16736 \n"
" ds_read_b32 v14, %[v_sfl_sld] offset:20992 \n"
" ds_read_b32 v15, %[v_sfl_sld] offset:21024 \n"
" ds_read_b32 v16, %[v_sfl_sld] offset:21056 \n"
" ds_read_b32 v17, %[v_sfl_sld] offset:21088 \n"
" s_waitcnt lgkmcnt(0) \n"
" s_mov_b64 exec, %[s_execflag_0] \n" _UK_ATOMIC_ADD_
" %[v_os_o0], v10, s[8:9] \n"
" s_mov_b64 exec, %[s_execflag_1] \n" _UK_ATOMIC_ADD_
" %[v_os_o1], v11, s[8:9] \n"
" s_mov_b64 exec, %[s_execflag_2] \n" _UK_ATOMIC_ADD_
" %[v_os_o2], v12, s[8:9] \n"
" s_mov_b64 exec, %[s_execflag_3] \n" _UK_ATOMIC_ADD_
" %[v_os_o3], v13, s[8:9] \n"
" s_mov_b64 exec, %[s_execflag_4] \n" _UK_ATOMIC_ADD_
" %[v_os_o4], v14, s[8:9] \n"
" s_mov_b64 exec, %[s_execflag_5] \n" _UK_ATOMIC_ADD_
" %[v_os_o5], v15, s[8:9] \n"
" s_mov_b64 exec, %[s_execflag_6] \n" _UK_ATOMIC_ADD_
" %[v_os_o6], v16, s[8:9] \n"
" s_mov_b64 exec, %[s_execflag_7] \n" _UK_ATOMIC_ADD_
" %[v_os_o7], v17, s[8:9] \n"
" s_mov_b64 exec, s[38:39] "
" \n"
" s_add_u32 s8, s59, s8 \n"
" s_addc_u32 s9, 0, s9 \n"
" ds_write_b64 %[v_sfl_sst], [%[c16],%[c17]] "
"offset:25344 \n"
" ds_write_b64 %[v_sfl_sst], [%[c18],%[c19]] "
"offset:29696 \n"
" ds_write_b64 %[v_sfl_sst], [%[c20],%[c21]] "
"offset:27520 \n"
" ds_write_b64 %[v_sfl_sst], [%[c22],%[c23]] "
"offset:31872 \n"
" s_waitcnt lgkmcnt(0) \n"
" s_barrier \n"
" ds_read_b32 v10, %[v_sfl_sld] offset:25344 \n"
" ds_read_b32 v11, %[v_sfl_sld] offset:25376 \n"
" ds_read_b32 v12, %[v_sfl_sld] offset:25408 \n"
" ds_read_b32 v13, %[v_sfl_sld] offset:25440 \n"
" ds_read_b32 v14, %[v_sfl_sld] offset:29696 \n"
" ds_read_b32 v15, %[v_sfl_sld] offset:29728 \n"
" ds_read_b32 v16, %[v_sfl_sld] offset:29760 \n"
" ds_read_b32 v17, %[v_sfl_sld] offset:29792 \n"
" s_waitcnt lgkmcnt(0) \n"
" s_mov_b64 exec, %[s_execflag_0] \n" _UK_ATOMIC_ADD_
" %[v_os_o0], v10, s[8:9] \n"
" s_mov_b64 exec, %[s_execflag_1] \n" _UK_ATOMIC_ADD_
" %[v_os_o1], v11, s[8:9] \n"
" s_mov_b64 exec, %[s_execflag_2] \n" _UK_ATOMIC_ADD_
" %[v_os_o2], v12, s[8:9] \n"
" s_mov_b64 exec, %[s_execflag_3] \n" _UK_ATOMIC_ADD_
" %[v_os_o3], v13, s[8:9] \n"
" s_mov_b64 exec, %[s_execflag_4] \n" _UK_ATOMIC_ADD_
" %[v_os_o4], v14, s[8:9] \n"
" s_mov_b64 exec, %[s_execflag_5] \n" _UK_ATOMIC_ADD_
" %[v_os_o5], v15, s[8:9] \n"
" s_mov_b64 exec, %[s_execflag_6] \n" _UK_ATOMIC_ADD_
" %[v_os_o6], v16, s[8:9] \n"
" s_mov_b64 exec, %[s_execflag_7] \n" _UK_ATOMIC_ADD_
" %[v_os_o7], v17, s[8:9] \n"
" s_mov_b64 exec, s[38:39] \n"
#undef _UK_MFMA_
#undef _UK_PK_CVT_
#undef _UK_ATOMIC_ADD_
...@@ -61,15 +61,16 @@ struct FusedMoeGemmShape ...@@ -61,15 +61,16 @@ struct FusedMoeGemmShape
// TODO: we don't support half warps aound to 1 warp here // TODO: we don't support half warps aound to 1 warp here
static_assert(NumWarps == reduce_on_sequence(WarpPerBlock_1{}, multiplies{}, number<1>{})); static_assert(NumWarps == reduce_on_sequence(WarpPerBlock_1{}, multiplies{}, number<1>{}));
static constexpr index_t Block_M0 = BlockTile_0::at(number<0>{}); // S<32, 512, 128>, S<1, 4, 1>, S<16, 16, 32>
static constexpr index_t Block_N0 = BlockTile_0::at(number<1>{}); static constexpr index_t Block_M0 = BlockTile_0::at(number<0>{}); //32
static constexpr index_t Block_K0 = BlockTile_0::at(number<2>{}); static constexpr index_t Block_N0 = BlockTile_0::at(number<1>{}); //512
static constexpr index_t WarpPerBlock_M0 = WarpPerBlock_0::at(number<0>{}); static constexpr index_t Block_K0 = BlockTile_0::at(number<2>{}); // 128
static constexpr index_t WarpPerBlock_N0 = WarpPerBlock_0::at(number<1>{}); static constexpr index_t WarpPerBlock_M0 = WarpPerBlock_0::at(number<0>{}); // 1
static constexpr index_t WarpPerBlock_K0 = WarpPerBlock_0::at(number<2>{}); static constexpr index_t WarpPerBlock_N0 = WarpPerBlock_0::at(number<1>{}); // 4
static constexpr index_t Warp_M0 = WarpTile_0::at(number<0>{}); static constexpr index_t WarpPerBlock_K0 = WarpPerBlock_0::at(number<2>{}); // 1
static constexpr index_t Warp_N0 = WarpTile_0::at(number<1>{}); static constexpr index_t Warp_M0 = WarpTile_0::at(number<0>{}); // 16
static constexpr index_t Warp_K0 = WarpTile_0::at(number<2>{}); static constexpr index_t Warp_N0 = WarpTile_0::at(number<1>{}); // 16
static constexpr index_t Warp_K0 = WarpTile_0::at(number<2>{}); // 32
static constexpr index_t ThreadPerBlock_M0 = Warp_M0 * WarpPerBlock_M0; static constexpr index_t ThreadPerBlock_M0 = Warp_M0 * WarpPerBlock_M0;
static constexpr index_t ThreadPerBlock_N0 = Warp_N0 * WarpPerBlock_N0; static constexpr index_t ThreadPerBlock_N0 = Warp_N0 * WarpPerBlock_N0;
...@@ -77,19 +78,19 @@ struct FusedMoeGemmShape ...@@ -77,19 +78,19 @@ struct FusedMoeGemmShape
static_assert(Block_M0 % ThreadPerBlock_M0 == 0); static_assert(Block_M0 % ThreadPerBlock_M0 == 0);
static_assert(Block_N0 % ThreadPerBlock_N0 == 0); static_assert(Block_N0 % ThreadPerBlock_N0 == 0);
static_assert(Block_K0 % ThreadPerBlock_K0 == 0); static_assert(Block_K0 % ThreadPerBlock_K0 == 0);
static constexpr index_t Repeat_M0 = Block_M0 / ThreadPerBlock_M0; static constexpr index_t Repeat_M0 = Block_M0 / ThreadPerBlock_M0; // 2
static constexpr index_t Repeat_N0 = Block_N0 / ThreadPerBlock_N0; static constexpr index_t Repeat_N0 = Block_N0 / ThreadPerBlock_N0; // 8
static constexpr index_t Repeat_K0 = Block_K0 / ThreadPerBlock_K0; static constexpr index_t Repeat_K0 = Block_K0 / ThreadPerBlock_K0; // 4
static constexpr index_t Block_M1 = BlockTile_1::at(number<0>{}); static constexpr index_t Block_M1 = BlockTile_1::at(number<0>{}); //32
static constexpr index_t Block_N1 = BlockTile_1::at(number<1>{}); static constexpr index_t Block_N1 = BlockTile_1::at(number<1>{}); //128
static constexpr index_t Block_K1 = BlockTile_1::at(number<2>{}); static constexpr index_t Block_K1 = BlockTile_1::at(number<2>{}); //512
static constexpr index_t WarpPerBlock_M1 = WarpPerBlock_1::at(number<0>{}); static constexpr index_t WarpPerBlock_M1 = WarpPerBlock_1::at(number<0>{}); // 1
static constexpr index_t WarpPerBlock_N1 = WarpPerBlock_1::at(number<1>{}); static constexpr index_t WarpPerBlock_N1 = WarpPerBlock_1::at(number<1>{}); // 4
static constexpr index_t WarpPerBlock_K1 = WarpPerBlock_1::at(number<2>{}); static constexpr index_t WarpPerBlock_K1 = WarpPerBlock_1::at(number<2>{}); // 1
static constexpr index_t Warp_M1 = WarpTile_1::at(number<0>{}); static constexpr index_t Warp_M1 = WarpTile_1::at(number<0>{}); // 16
static constexpr index_t Warp_N1 = WarpTile_1::at(number<1>{}); static constexpr index_t Warp_N1 = WarpTile_1::at(number<1>{}); // 16
static constexpr index_t Warp_K1 = WarpTile_1::at(number<2>{}); static constexpr index_t Warp_K1 = WarpTile_1::at(number<2>{}); // 32
static constexpr index_t ThreadPerBlock_M1 = Warp_M1 * WarpPerBlock_M1; static constexpr index_t ThreadPerBlock_M1 = Warp_M1 * WarpPerBlock_M1;
static constexpr index_t ThreadPerBlock_N1 = Warp_N1 * WarpPerBlock_N1; static constexpr index_t ThreadPerBlock_N1 = Warp_N1 * WarpPerBlock_N1;
...@@ -97,9 +98,9 @@ struct FusedMoeGemmShape ...@@ -97,9 +98,9 @@ struct FusedMoeGemmShape
static_assert(Block_M1 % ThreadPerBlock_M1 == 0); static_assert(Block_M1 % ThreadPerBlock_M1 == 0);
static_assert(Block_N1 % ThreadPerBlock_N1 == 0); static_assert(Block_N1 % ThreadPerBlock_N1 == 0);
static_assert(Block_K1 % ThreadPerBlock_K1 == 0); static_assert(Block_K1 % ThreadPerBlock_K1 == 0);
static constexpr index_t Repeat_M1 = Block_M1 / ThreadPerBlock_M1; static constexpr index_t Repeat_M1 = Block_M1 / ThreadPerBlock_M1; // 2
static constexpr index_t Repeat_N1 = Block_N1 / ThreadPerBlock_N1; static constexpr index_t Repeat_N1 = Block_N1 / ThreadPerBlock_N1; // 2
static constexpr index_t Repeat_K1 = Block_K1 / ThreadPerBlock_K1; static constexpr index_t Repeat_K1 = Block_K1 / ThreadPerBlock_K1; // 16
static constexpr index_t BlockSize = warpSize * NumWarps; static constexpr index_t BlockSize = warpSize * NumWarps;
...@@ -115,9 +116,9 @@ struct FusedMoeGemmShape ...@@ -115,9 +116,9 @@ struct FusedMoeGemmShape
static constexpr index_t Block_W0 = Warp_N0 * Warp_K0; static constexpr index_t Block_W0 = Warp_N0 * Warp_K0;
static constexpr index_t Block_Nr0 = Block_N0 / Warp_N0; static constexpr index_t Block_Nr0 = Block_N0 / Warp_N0;
static constexpr index_t Block_Kr0 = Block_K0 / Warp_K0; static constexpr index_t Block_Kr0 = Block_K0 / Warp_K0;
static constexpr index_t Block_W1 = Warp_N1 * Warp_K1; static constexpr index_t Block_W1 = Warp_N1 * Warp_K1; // 512
static constexpr index_t Block_Nr1 = Block_N1 / Warp_N1; static constexpr index_t Block_Nr1 = Block_N1 / Warp_N1; // 8
static constexpr index_t Block_Kr1 = Block_K1 / Warp_K1; static constexpr index_t Block_Kr1 = Block_K1 / Warp_K1; // 16
static_assert(Block_W0 == Block_W1); static_assert(Block_W0 == Block_W1);
// static_assert(Block_Nr0 == Block_Kr1); // static_assert(Block_Nr0 == Block_Kr1);
......
...@@ -199,6 +199,8 @@ struct FusedMoeGemmPipeline_FlatmmUk ...@@ -199,6 +199,8 @@ struct FusedMoeGemmPipeline_FlatmmUk
threadIdx.x % (BlockShape::Block_K0 / kAlignmentA) * kAlignmentA; threadIdx.x % (BlockShape::Block_K0 / kAlignmentA) * kAlignmentA;
}, },
number<row_ids_a.size()>{}); number<row_ids_a.size()>{});
if (row_ids_a[0] >= kargs.num_tokens)
return;
auto a_res = auto a_res =
make_wave_buffer_resource(reinterpret_cast<const ADataType*>(kargs.a_ptr), make_wave_buffer_resource(reinterpret_cast<const ADataType*>(kargs.a_ptr),
kargs.num_tokens * kargs.stride_token * sizeof(ADataType)); kargs.num_tokens * kargs.stride_token * sizeof(ADataType));
...@@ -238,7 +240,7 @@ struct FusedMoeGemmPipeline_FlatmmUk ...@@ -238,7 +240,7 @@ struct FusedMoeGemmPipeline_FlatmmUk
const auto d_view_ = make_naive_tensor_view<address_space_enum::global>( const auto d_view_ = make_naive_tensor_view<address_space_enum::global>(
d_ptr, d_ptr,
make_tuple(nr_1, kr_1, BlockShape::Block_W1), make_tuple(nr_1, kr_1, BlockShape::Block_W1), // n/16, k/32, 512
make_tuple(kr_1 * BlockShape::Block_W1, BlockShape::Block_W1, 1), make_tuple(kr_1 * BlockShape::Block_W1, BlockShape::Block_W1, 1),
number<kAlignmentD>{}, number<kAlignmentD>{},
number<1>{}); number<1>{});
...@@ -264,13 +266,13 @@ struct FusedMoeGemmPipeline_FlatmmUk ...@@ -264,13 +266,13 @@ struct FusedMoeGemmPipeline_FlatmmUk
auto d_coords = [&]() { auto d_coords = [&]() {
constexpr index_t Nr_ = 2; constexpr index_t Nr_ = 2;
constexpr index_t Nw_ = 4; constexpr index_t Nw_ = 4;
constexpr index_t Kr0_ = 4; constexpr index_t Kr0_ = BlockShape::Block_Kr1 / Kr1_; //4
constexpr index_t Kr1_ = 4; constexpr index_t Kr1_ = 4;
constexpr index_t Kl_ = 4; constexpr index_t Kl_ = 4;
constexpr index_t Nl_ = 16; constexpr index_t Nl_ = 16;
constexpr index_t Kv_ = 8; constexpr index_t Kv_ = 8;
constexpr index_t W_ = Kl_ * Nl_ * Kv_; constexpr index_t W_ = Kl_ * Nl_ * Kv_; // 512
constexpr index_t num_offsets_ = Nr_ * Kr0_; constexpr index_t num_offsets_ = Nr_ * Kr0_; // 8
index_t base_os_ = (threadIdx.x % 64) * Kv_ + (threadIdx.x / 64) * index_t base_os_ = (threadIdx.x % 64) * Kv_ + (threadIdx.x / 64) *
shared_intermediate_size_1 * shared_intermediate_size_1 *
Nl_; // Kr0_ * Kr1_ * W_; Nl_; // Kr0_ * Kr1_ * W_;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment