Commit 811b75d3 authored by shengnxu's avatar shengnxu
Browse files

staging code for backup

parent 1c9a5ff4
......@@ -28,6 +28,12 @@ float fused_moegemm(fused_moegemm_traits t, fused_moegemm_args a, const ck_tile:
using t_ = fmoe_<ck_tile::fp16_t, ck_tile::fp16_t, ck_tile::fp16_t, float, float, float, float, S<32, 512, 128, 128>, S<1, 4, 1>, S<16, 16, 32>, 1, 0>;
r = fused_moegemm_<t_>(s, a);
}
else if(t.prec_i == "int8" && t.prec_w == "int8" && t.prec_o == "bf16" && t.prec_st == "fp32" &&
t.prec_sw == "fp32" && t.prec_sq == "fp32" && t.prec_kw == "fp32" && t.block_m == 32 && t.gate_only == 1)
{
using t_ = fmoe_<ck_tile::int8_t, ck_tile::int8_t, ck_tile::bf16_t, float, float, float, float, S<32, 512, 256, 256>, S<1, 4, 1>, S<16, 16, 64>, 1, 1>;
r = fused_moegemm_<t_>(s, a);
}
// clang-format on
return r;
}
......@@ -38,7 +38,8 @@ float fused_moegemm_(const ck_tile::stream_config& s, fused_moegemm_args a)
f_traits>;
// using f_pipeline = ck_tile::FusedMoeGemmPipeline_FlatmmEx<f_problem>;
using f_pipeline = ck_tile::FusedMoeGemmPipeline_FlatmmUk<f_problem>;
using f_pipeline = ck_tile::FusedMoeGemmPipeline_FlatmmUk_int8<
>;
using f_partitioner = ck_tile::FusedMoeGemmTilePartitioner_Linear<f_shape>;
using f_kernel = ck_tile::FusedMoeGemmKernel<f_partitioner, f_pipeline, void>;
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/gemm/warp/warp_gemm.hpp"
#include "ck_tile/ops/flatmm/block/flatmm_uk_config.hpp"
namespace ck_tile {
// A async load to LDS, B direct to AGPR
// B matrix preshuffled in br*kr*w
// require 4 wave, occupancy=1c
// agpr useage:256
// vgpr usage:64(A local) + 64(acc) + 8(os_a) + 8(os_b) = 144 (rem:112)
//
// for this gemm, 4 16x16x16 transposed layout
// input A vpgpr layout
// v0-v15: [ 0:15](gemm_m)x128(gemm_k)
// v16-v31: [16:31](gemm_m)x128(gemm_k)
// input B vpgpr layout
// v0-v15: [ 0: 15](gemm_n)x128(gemm_k)
// v16-v31: [ 64: 79](gemm_n)x128(gemm_k)
// ......................
// v111-v127: [448:463](gemm_n)x128(gemm_k)
// output C vpgpr layout
// v0-v3 : [ 0:15](gemm_m)x[ 0: 15](gemm_n)
// v4-v7 : [16:31](gemm_m)x[ 0: 15](gemm_n)
// v8-v11: [ 0:15](gemm_m)x[64: 79](gemm_n)
// v12-v15: [16:31](gemm_m)x[64: 79](gemm_n)
// ......................
// v56-v59: [ 0:15](gemm_m)x[448:463](gemm_n)
// v60-v63: [16:31](gemm_m)x[448:463](gemm_n)
struct Flatmm_32x512x256_1x4x1_16x16x64_Base // for int8/fp8
{
static constexpr index_t Block_M = 32;
static constexpr index_t Block_N = 512;
static constexpr index_t Block_K = 258;
static constexpr index_t WarpPerBlock_M = 1;
static constexpr index_t WarpPerBlock_N = 4;
static constexpr index_t WarpPerBlock_K = 1;
static constexpr index_t NumWarps = 4;
static constexpr index_t Warp_M = 16;
static constexpr index_t Warp_N = 16;
static constexpr index_t Warp_K = 64; // 32 * SubKPacks
static constexpr index_t BlockSize = 256;
static constexpr index_t SubKPacks = 4; // this is used to gurantee every threads can do dwordx4
// TODO: note Nr/Kr/W need consider SubKPacks
static constexpr index_t Block_W = Warp_N * Warp_K; // 1024 element
static constexpr index_t Block_Nr = Block_N / Warp_N; // 32 element, 4 per wave
static constexpr index_t Block_Kr = Block_K / Warp_K; // 4
static constexpr index_t Repeat_M = Block_M / (Warp_M * WarpPerBlock_M); // 2
static constexpr index_t Repeat_N = Block_N / (Warp_N * WarpPerBlock_N); // 8
static constexpr index_t Repeat_K = Block_K / (Warp_K * WarpPerBlock_K); // 8/2=4
static CK_TILE_DEVICE constexpr auto MakeCBlockDist()
{
constexpr auto c_block_outer_dstr_encoding = tile_distribution_encoding<
sequence<>,
tuple<sequence<Repeat_M, WarpPerBlock_M>, sequence<Repeat_N, WarpPerBlock_N>>,
tuple<sequence<1, 2>>,
tuple<sequence<1, 1>>,
sequence<2, 1>, // !! note here is different
sequence<0, 0>>{};
using WG = WarpGemmMfma_i32_16x16x64_int8_int8_CTransposed;
constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
c_block_outer_dstr_encoding, typename WG::CWarpDstrEncoding{});
constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode);
return c_block_dstr;
}
static CK_TILE_DEVICE constexpr auto MakeCBlockTile()
{
using CDataType = float;
constexpr auto c_block_dstr = MakeCBlockDist();
auto c_block_tensor = make_static_distributed_tensor<CDataType>(c_block_dstr);
return c_block_tensor;
}
CK_TILE_HOST_DEVICE static constexpr auto MakeLdsStoreDesc_A()
{
// A async->LDS
// constexpr index_t Block_M = Problem::BlockShape::Block_M0;
// constexpr index_t Block_K = Problem::BlockShape::Block_K0;
// constexpr index_t BlockSize = Problem::BlockShape::BlockSize;
constexpr index_t warpSize = ck_tile::get_warp_size();
// constexpr index_t NumWarps = Problem::BlockShape::NumWarps;
constexpr index_t KPack_ = 16; // GetSmemKPack_A<Problem>(); // LDS
constexpr index_t KVector = 4; // GetAlignment_A<Problem>(); // async copy 1 dword
constexpr index_t KPad = KPack_; // pad between warps
static_assert(Block_K % KVector == 0);
constexpr index_t LanesPerK = Block_K / KVector; // how many thread loading K
if constexpr(LanesPerK >= warpSize)
{
// need multiple waves to load K
static_assert(LanesPerK % warpSize == 0);
constexpr index_t wavesPerK = LanesPerK / warpSize;
if constexpr(wavesPerK > NumWarps)
{
// TODO: need multiple issues along K to load all data
}
else
{
constexpr index_t wavesPerM = NumWarps / wavesPerK;
constexpr index_t NumIssues = Block_M / wavesPerM;
constexpr auto lds_block_desc_0 = make_naive_tensor_descriptor(
make_tuple(number<NumIssues>{}, // m0
number<wavesPerM>{}, // m1
number<wavesPerK>{}, // k0
number<warpSize>{}, // k1
number<KVector>{}), // k2
make_tuple(number<NumWarps*(warpSize * KVector + KPad)>{}, // m0
number<wavesPerK*(warpSize * KVector + KPad)>{}, // m1
number<warpSize * KVector + KPad>{}, // k0
number<KVector>{}, // k1
number<1>{}), // k2
number<KVector>{}, // lds store vector(actually no explicit store)
number<1>{});
constexpr auto lds_block_desc_issues_warps_lanes = transform_tensor_descriptor(
lds_block_desc_0,
make_tuple(
make_pass_through_transform(number<NumIssues>{}),
make_merge_transform(make_tuple(number<wavesPerM>{}, number<wavesPerK>{})),
make_merge_transform(make_tuple(number<warpSize>{}, number<KVector>{}))),
make_tuple(sequence<0>{}, sequence<1, 2>{}, sequence<3, 4>{}),
make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}));
return lds_block_desc_issues_warps_lanes;
}
}
else
{
// lanes within a wave load different M but same K
static_assert(warpSize % LanesPerK == 0);
constexpr index_t LaneGroups = warpSize / LanesPerK; // along m
constexpr index_t NumIssues = Block_M / (LaneGroups * NumWarps);
constexpr auto lds_block_desc_0 = make_naive_tensor_descriptor(
make_tuple(number<NumIssues>{}, // m0
number<LaneGroups>{}, // m1
number<NumWarps>{}, // m2
number<LanesPerK>{}, // k0
number<KVector>{}), // k1
make_tuple(number<NumWarps*(warpSize * KVector + KPad)>{}, // m0
number<Block_K>{}, // m1
number<warpSize * KVector + KPad>{}, // m2
number<KVector>{}, // k0
number<1>{}), // k1
number<KVector>{}, // lds store vector(actually no explicit store)
number<1>{});
constexpr auto lds_block_desc_issues_warps_lanes = transform_tensor_descriptor(
lds_block_desc_0,
make_tuple(make_pass_through_transform(number<NumIssues>{}),
make_pass_through_transform(number<NumWarps>{}),
make_merge_transform(make_tuple(
number<LaneGroups>{}, number<LanesPerK>{}, number<KVector>{}))),
make_tuple(sequence<0>{}, sequence<2>{}, sequence<1, 3, 4>{}),
make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}));
return lds_block_desc_issues_warps_lanes;
}
}
// template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto MakeLdsLoadDesc_A()
{
// load from LDS to register, every wave has same layout
constexpr index_t KPack_ = 16; // GetSmemKPack_A<Problem>(); // LDS
constexpr index_t KPad = KPack_; // pad between warps
constexpr index_t kAMLane = 16;
constexpr index_t kABKLane = 4;
constexpr index_t kABKPerLane = 8;
constexpr index_t kKIter = 2;
static_assert(KPack_ == (kABKPerLane * kKIter));
constexpr auto lds_block_desc_0 =
make_naive_tensor_descriptor(make_tuple(number<Repeat_M>{}, // m0 y
number<kAMLane>{}, // m1 p
number<Repeat_K>{}, // k0 y
number<kABKLane>{}, // k1 p
number<KPack_>{}), // k2 y-vector
make_tuple(number<kAMLane*(Block_K + KPad)>{}, // m0
number<Block_K + KPad>{}, // m1
number<kABKLane * KPack_>{}, // k0
number<KPack_>{}, // k1
number<1>{}), // k2
number<KPack_>{}, // lds load vector
number<1>{});
constexpr auto lds_desc_m_k = transform_tensor_descriptor(
lds_block_desc_0,
make_tuple(make_merge_transform(make_tuple(number<Repeat_M>{}, number<kAMLane>{})),
make_merge_transform(
make_tuple(number<Repeat_K>{}, number<kABKLane>{}, number<KPack_>{}))),
make_tuple(sequence<0, 1>{}, sequence<2, 3, 4>{}),
make_tuple(sequence<0>{}, sequence<1>{}));
return lds_desc_m_k;
}
static constexpr auto GetGemm_AWarpEnc()
{
constexpr index_t kAMLane = 16;
constexpr index_t kABKLane = 4;
constexpr index_t kABKPerLane = 8;
constexpr index_t kKIter = 2;
using enc_ = tile_distribution_encoding<
sequence<>,
tuple<sequence<kAMLane>, sequence<kABKLane, kABKPerLane * kKIter>>,
tuple<sequence<2, 1>>,
tuple<sequence<0, 0>>,
sequence<2>,
sequence<1>>;
return enc_{};
}
CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize()
{
return 32 * (256 + 16) * sizeof(int8_t);
}
};
struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16x16x64_Base
{
using ADataType = int8_t;
using BDataType = int8_t;
// TODO: need paired with tile_window_linear!
// TODO: need call init_raw() before call this function!
template <typename DQRes, typename GQRes,typename ARes, typename ACoords, typename BRes, typename BCoords>
CK_TILE_DEVICE auto
operator()( index_t row_ids_a_,
const DQes& res_aq
const DQes& res_dq,
const GQRes& res_gq,
const Res& res_a,
const ACoords& cached_coords_a,
const BRes& res_b,
const BCoords& cached_coords_b,
CK_TILE_LDS_ADDR void* smem,
index_t k,
index_t tile_offset_a, // for each tile, the offset to move for each unroll
index_t tile_offset_b) // for each tile, the offset to move for each unroll
{
static_assert(ACoords::size() == Block_M * Block_K / BlockSize / 4 /*2x per dword*/); // 8
static_assert(BCoords::size() == Repeat_N);
auto a_sst = make_tile_window(
make_tensor_view<address_space_enum::lds>(
reinterpret_cast<CK_TILE_LDS_ADDR ADataType*>(smem), MakeLdsStoreDesc_A()),
MakeLdsStoreDesc_A().get_lengths(),
{0, 0, 0});
auto a_sld = [&]() {
constexpr auto a_warp_enc_ = GetGemm_AWarpEnc();
constexpr auto a_outer_dstr_enc = tile_distribution_encoding<
sequence<WarpPerBlock_N>,
tuple<sequence<Repeat_M, WarpPerBlock_M>, sequence<Repeat_K>>,
tuple<sequence<1, 0>>,
tuple<sequence<1, 0>>,
sequence<1, 2>,
sequence<0, 0>>{};
constexpr auto a_block_dstr_encode =
detail::make_embed_tile_distribution_encoding(a_outer_dstr_enc, a_warp_enc_);
return make_tile_window_linear(
make_tensor_view<address_space_enum::lds>(
reinterpret_cast<CK_TILE_LDS_ADDR ADataType*>(smem), MakeLdsLoadDesc_A()),
MakeLdsLoadDesc_A().get_lengths(),
{0, 0},
make_static_tile_distribution(a_block_dstr_encode));
}();
const index_t tile_offset_a_bytes = tile_offset_a * sizeof(ADataType);
const index_t tile_offset_b_bytes = tile_offset_b * sizeof(BDataType);
const auto [m0_init_value, size_per_issue] = get_async_store_smem_info(a_sst);
constexpr auto smem_buf_size =
MakeLdsLoadDesc_A().get_element_space_size() * sizeof(ADataType);
static_assert(a_sld.get_num_of_access() == 8);
constexpr auto sld_os = generate_tuple(
[&](auto i_access) {
return number<a_sld.get_bottom_linear_offset(i_access) * sizeof(ADataType)>{};
},
number<a_sld.get_num_of_access()>{});
index_t loop_cnt = k / Block_K;
// this is the acc thread buffer
register int v_z0 asm("v128") = 0;
register int v_z1 asm("v129") = 0;
register int v_z2 asm("v130") = 0;
register int v_z3 asm("v131") = 0;
register int v_z4 asm("v132") = 0;
register int v_z5 asm("v133") = 0;
register int v_z6 asm("v134") = 0;
register int v_z7 asm("v135") = 0;
register int v_z8 asm("v136") = 0;
register int v_z9 asm("v137") = 0;
register int v_z10 asm("v138") = 0;
register int v_z11 asm("v139") = 0;
register int v_z12 asm("v140") = 0;
register int v_z13 asm("v141") = 0;
register int v_z14 asm("v142") = 0;
register int v_z15 asm("v143") = 0;
register int v_z16 asm("v144") = 0;
register int v_z17 asm("v145") = 0;
register int v_z18 asm("v146") = 0;
register int v_z19 asm("v147") = 0;
register int v_z20 asm("v148") = 0;
register int v_z21 asm("v149") = 0;
register int v_z22 asm("v150") = 0;
register int v_z23 asm("v151") = 0;
register int v_z24 asm("v152") = 0;
register int v_z25 asm("v153") = 0;
register int v_z26 asm("v154") = 0;
register int v_z27 asm("v155") = 0;
register int v_z28 asm("v156") = 0;
register int v_z29 asm("v157") = 0;
register int v_z30 asm("v158") = 0;
register int v_z31 asm("v159") = 0;
register int v_z32 asm("v160") = 0;
register int v_z33 asm("v161") = 0;
register int v_z34 asm("v162") = 0;
register int v_z35 asm("v163") = 0;
register int v_z36 asm("v164") = 0;
register int v_z37 asm("v165") = 0;
register int v_z38 asm("v166") = 0;
register int v_z39 asm("v167") = 0;
register int v_z40 asm("v168") = 0;
register int v_z41 asm("v169") = 0;
register int v_z42 asm("v170") = 0;
register int v_z43 asm("v171") = 0;
register int v_z44 asm("v172") = 0;
register int v_z45 asm("v173") = 0;
register int v_z46 asm("v174") = 0;
register int v_z47 asm("v175") = 0;
register int v_z48 asm("v176") = 0;
register int v_z49 asm("v177") = 0;
register int v_z50 asm("v178") = 0;
register int v_z51 asm("v179") = 0;
register int v_z52 asm("v180") = 0;
register int v_z53 asm("v181") = 0;
register int v_z54 asm("v182") = 0;
register int v_z55 asm("v183") = 0;
register int v_z56 asm("v184") = 0;
register int v_z57 asm("v185") = 0;
register int v_z58 asm("v186") = 0;
register int v_z59 asm("v187") = 0;
register int v_z60 asm("v188") = 0;
register int v_z61 asm("v189") = 0;
register int v_z62 asm("v190") = 0;
register int v_z63 asm("v191") = 0;
// B nr->kr
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Winline-asm"
// clang-format off
asm volatile(
#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_INT8
#include "uk/flatmm_uk_gfx9_32x512x256_1x1x1_16x16x32_int8.inc"
#undef CK_TILE_FLATMM_UK_MFMA
: [s_loop_cnt]"+s"(loop_cnt),
// [v_acc_0]"+v"(v_acc[0]),
// [v_acc_1]"+v"(v_acc[1]),
// [v_acc_2]"+v"(v_acc[2]),
// [v_acc_3]"+v"(v_acc[3]),
// [v_acc_4]"+v"(v_acc[4]),
// [v_acc_5]"+v"(v_acc[5]),
// [v_acc_6]"+v"(v_acc[6]),
// [v_acc_7]"+v"(v_acc[7]),
// [v_acc_8]"+v"(v_acc[8]),
// [v_acc_9]"+v"(v_acc[9]),
// [v_acc_10]"+v"(v_acc[10]),
// [v_acc_11]"+v"(v_acc[11]),
// [v_acc_12]"+v"(v_acc[12]),
// [v_acc_13]"+v"(v_acc[13]),
// [v_acc_14]"+v"(v_acc[14]),
// [v_acc_15]"+v"(v_acc[15]),
[v_token_id]"+v"(row_ids_a_),
[s_mem_]"+r"(smem)
: [s_res_dq0]"s"(res_dq[0]),
[s_res_dq1]"s"(res_dq[1]),
[s_res_dq2]"s"(res_dq[2]),
[s_res_dq3]"s"(res_dq[3]),
[s_res_gq0]"s"(res_gq[0]),
[s_res_gq1]"s"(res_gq[1]),
[s_res_gq2]"s"(res_gq[2]),
[s_res_gq3]"s"(res_gq[3]),
[s_res_a0]"s"(res_a[0]),
[s_res_a1]"s"(res_a[1]),
[s_res_a2]"s"(res_a[2]),
[s_res_a3]"s"(res_a[3]),
[s_res_b0]"s"(res_b[0]),
[s_res_b1]"s"(res_b[1]),
[s_res_b2]"s"(res_b[2]),
[s_res_b3]"s"(res_b[3]),
[v_os_a0]"v"(static_cast<index_t>(cached_coords_a[number<0>{}] * sizeof(ADataType))),
[v_os_a1]"v"(static_cast<index_t>(cached_coords_a[number<1>{}] * sizeof(ADataType))),
[v_os_a2]"v"(static_cast<index_t>(cached_coords_a[number<2>{}] * sizeof(ADataType))),
[v_os_a3]"v"(static_cast<index_t>(cached_coords_a[number<3>{}] * sizeof(ADataType))),
[v_os_a4]"v"(static_cast<index_t>(cached_coords_a[number<4>{}] * sizeof(ADataType))),
[v_os_a5]"v"(static_cast<index_t>(cached_coords_a[number<5>{}] * sizeof(ADataType))),
[v_os_a6]"v"(static_cast<index_t>(cached_coords_a[number<6>{}] * sizeof(ADataType))),
[v_os_a7]"v"(static_cast<index_t>(cached_coords_a[number<7>{}] * sizeof(ADataType))),
[v_os_b0]"v"(static_cast<index_t>(cached_coords_b[number<0>{}] * sizeof(BDataType))),
[v_os_b1]"v"(static_cast<index_t>(cached_coords_b[number<1>{}] * sizeof(BDataType))),
[v_os_b2]"v"(static_cast<index_t>(cached_coords_b[number<2>{}] * sizeof(BDataType))),
[v_os_b3]"v"(static_cast<index_t>(cached_coords_b[number<3>{}] * sizeof(BDataType))),
[v_os_b4]"v"(static_cast<index_t>(cached_coords_b[number<4>{}] * sizeof(BDataType))),
[v_os_b5]"v"(static_cast<index_t>(cached_coords_b[number<5>{}] * sizeof(BDataType))),
[v_os_b6]"v"(static_cast<index_t>(cached_coords_b[number<6>{}] * sizeof(BDataType))),
[v_os_b7]"v"(static_cast<index_t>(cached_coords_b[number<7>{}] * sizeof(BDataType))),
[v_os_slda]"v"(static_cast<index_t>(a_sld.cached_coords_[number<0>{}].get_offset() * sizeof(ADataType))),
[s_m0_init]"s"(m0_init_value),
[s_size_per_issue]"s"(size_per_issue),
[smem_sz]"n"(smem_buf_size), //(smem_buf_size),
[sld_os_0]"n"(sld_os[number<0>{}].value),
[sld_os_1]"n"(sld_os[number<1>{}].value),
[sld_os_2]"n"(sld_os[number<2>{}].value),
[sld_os_3]"n"(sld_os[number<3>{}].value),
[sld_os_4]"n"(sld_os[number<4>{}].value),
[sld_os_5]"n"(sld_os[number<5>{}].value),
[sld_os_6]"n"(sld_os[number<6>{}].value),
[sld_os_7]"n"(sld_os[number<7>{}].value),
[s_tile_os_a]"s"(tile_offset_a_bytes),
[s_tile_os_b]"s"(tile_offset_b_bytes)
: "memory", "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8", "a9",
"a10", "a11", "a12", "a13", "a14", "a15", "a16", "a17", "a18", "a19",
"a20", "a21", "a22", "a23", "a24", "a25", "a26", "a27", "a28", "a29",
"a30", "a31", "a32", "a33", "a34", "a35", "a36", "a37", "a38", "a39",
"a40", "a41", "a42", "a43", "a44", "a45", "a46", "a47", "a48", "a49",
"a50", "a51", "a52", "a53", "a54", "a55", "a56", "a57", "a58", "a59",
"a60", "a61", "a62", "a63", "a64", "a65", "a66", "a67", "a68", "a69",
"a70", "a71", "a72", "a73", "a74", "a75", "a76", "a77", "a78", "a79",
"a80", "a81", "a82", "a83", "a84", "a85", "a86", "a87", "a88", "a89",
"a90", "a91", "a92", "a93", "a94", "a95", "a96", "a97", "a98", "a99",
"a100", "a101", "a102", "a103", "a104", "a105", "a106", "a107",
"a108", "a109", "a110", "a111", "a112", "a113", "a114", "a115",
"a116", "a117", "a118", "a119", "a120", "a121", "a122", "a123",
"a124", "a125", "a126", "a127", "a128", "a129", "a130", "a131",
"a132", "a133", "a134", "a135", "a136", "a137", "a138", "a139",
"a140", "a141", "a142", "a143", "a144", "a145", "a146", "a147",
"a148", "a149", "a150", "a151", "a152", "a153", "a154", "a155",
"a156", "a157", "a158", "a159", "a160", "a161", "a162", "a163",
"a164", "a165", "a166", "a167", "a168", "a169", "a170", "a171",
"a172", "a173", "a174", "a175", "a176", "a177", "a178", "a179",
"a180", "a181", "a182", "a183", "a184", "a185", "a186", "a187",
"a188", "a189", "a190", "a191", "a192", "a193", "a194", "a195",
"a196", "a197", "a198", "a199", "a200", "a201", "a202", "a203",
"a204", "a205", "a206", "a207", "a208", "a209", "a210", "a211",
"a212", "a213", "a214", "a215", "a216", "a217", "a218", "a219",
"a220", "a221", "a222", "a223", "a224", "a225", "a226", "a227",
"a228", "a229", "a230", "a231", "a232", "a233", "a234", "a235",
"a236", "a237", "a238", "a239", "a240", "a241", "a242", "a243",
"a244", "a245", "a246", "a247", "a248", "a249", "a250", "a251",
"a252", "a253", "a254", "a255",
"s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23",
"s86", // s86 as tmp
"v64", "v65", "v66", "v67", "v68", "v69",
"v70", "v71", "v72", "v73", "v74", "v75", "v76", "v77", "v78", "v79",
"v80", "v81", "v82", "v83", "v84", "v85", "v86", "v87", "v88", "v89",
"v90", "v91", "v92", "v93", "v94", "v95", "v96", "v97", "v98", "v99",
"v100", "v101", "v102", "v103", "v104", "v105", "v106", "v107",
"v108", "v109", "v110", "v111", "v112", "v113", "v114", "v115",
"v116", "v117", "v118", "v119", "v120", "v121", "v122", "v123",
"v124", "v125", "v126", "v127"
);
// clang-format on
#pragma clang diagnostic pop
int32x4_t v_acc[16]{0};
v_acc[0][0] = v_z128;
v_acc[0][1] = v_z129;
v_acc[0][2] = v_z130;
v_acc[0][3] = v_z131;
v_acc[1][0] = v_z132;
v_acc[1][1] = v_z133;
v_acc[1][2] = v_z134;
v_acc[1][3] = v_z135;
v_acc[2][0] = v_z136;
v_acc[2][1] = v_z137;
v_acc[2][2] = v_z138;
v_acc[2][3] = v_z139;
v_acc[3][0] = v_z140;
v_acc[3][1] = v_z141;
v_acc[3][2] = v_z142;
v_acc[3][3] = v_z143;
v_acc[4][0] = v_z144;
v_acc[4][1] = v_z145;
v_acc[4][2] = v_z146;
v_acc[4][3] = v_z147;
v_acc[5][0] = v_z148;
v_acc[5][1] = v_z149;
v_acc[5][2] = v_z150;
v_acc[5][3] = v_z151;
v_acc[6][0] = v_z152;
v_acc[6][1] = v_z153;
v_acc[6][2] = v_z154;
v_acc[6][3] = v_z155;
v_acc[7][0] = v_z156;
v_acc[7][1] = v_z157;
v_acc[7][2] = v_z158;
v_acc[7][3] = v_z159;
v_acc[8][0] = v_z160;
v_acc[8][1] = v_z161;
v_acc[8][2] = v_z162;
v_acc[8][3] = v_z163;
v_acc[9][0] = v_z164;
v_acc[9][1] = v_z165;
v_acc[9][2] = v_z166;
v_acc[9][3] = v_z167;
v_acc[10][0] = v_z168;
v_acc[10][1] = v_z169;
v_acc[10][2] = v_z170;
v_acc[10][3] = v_z171;
v_acc[11][0] = v_z172;
v_acc[11][1] = v_z173;
v_acc[11][2] = v_z174;
v_acc[11][3] = v_z175;
v_acc[12][0] = v_z176;
v_acc[12][1] = v_z177;
v_acc[12][2] = v_z178;
v_acc[12][3] = v_z179;
v_acc[13][0] = v_z180;
v_acc[13][1] = v_z181;
v_acc[13][2] = v_z182;
v_acc[13][3] = v_z183;
v_acc[14][0] = v_z184;
v_acc[14][1] = v_z185;
v_acc[14][2] = v_z186;
v_acc[14][3] = v_z187;
v_acc[15][0] = v_z188;
v_acc[15][1] = v_z189;
v_acc[15][2] = v_z190;
v_acc[15][3] = v_z191;
// return local scratch
auto c = MakeCBlockTile();
for(auto i = 0; i < 16; i++)
{
c.get_thread_buffer()[4 * i + 0] = v_acc[i].x;
c.get_thread_buffer()[4 * i + 1] = v_acc[i].y;
c.get_thread_buffer()[4 * i + 2] = v_acc[i].z;
c.get_thread_buffer()[4 * i + 3] = v_acc[i].w;
}
return c;
}
};
struct Flatmm_32x512x128_1x4x1_16x16x32_FP16 : public Flatmm_32x512x128_1x4x1_16x16x32_Base
{
using ADataType = fp16_t;
using BDataType = fp16_t;
// TODO: need paired with tile_window_linear!
// TODO: need call init_raw() before call this function!
template <typename ARes, typename ACoords, typename BRes, typename BCoords>
CK_TILE_DEVICE auto
operator()(const ARes& res_a,
const ACoords& cached_coords_a,
const BRes& res_b,
const BCoords& cached_coords_b,
CK_TILE_LDS_ADDR void* smem,
index_t k,
index_t tile_offset_a, // for each tile, the offset to move for each unroll
index_t tile_offset_b) // for each tile, the offset to move for each unroll
{
static_assert(ACoords::size() == Block_M * Block_K / BlockSize / 2 /*2x per dword*/); // 8
static_assert(BCoords::size() == Repeat_N);
auto a_sst = make_tile_window(
make_tensor_view<address_space_enum::lds>(
reinterpret_cast<CK_TILE_LDS_ADDR ADataType*>(smem), MakeLdsStoreDesc_A()),
MakeLdsStoreDesc_A().get_lengths(),
{0, 0, 0});
auto a_sld = [&]() {
constexpr auto a_warp_enc_ = GetGemm_AWarpEnc();
constexpr auto a_outer_dstr_enc = tile_distribution_encoding<
sequence<WarpPerBlock_N>,
tuple<sequence<Repeat_M, WarpPerBlock_M>, sequence<Repeat_K>>,
tuple<sequence<1, 0>>,
tuple<sequence<1, 0>>,
sequence<1, 2>,
sequence<0, 0>>{};
constexpr auto a_block_dstr_encode =
detail::make_embed_tile_distribution_encoding(a_outer_dstr_enc, a_warp_enc_);
return make_tile_window_linear(
make_tensor_view<address_space_enum::lds>(
reinterpret_cast<CK_TILE_LDS_ADDR ADataType*>(smem), MakeLdsLoadDesc_A()),
MakeLdsLoadDesc_A().get_lengths(),
{0, 0},
make_static_tile_distribution(a_block_dstr_encode));
}();
const index_t tile_offset_a_bytes = tile_offset_a * sizeof(ADataType);
const index_t tile_offset_b_bytes = tile_offset_b * sizeof(BDataType);
const auto [m0_init_value, size_per_issue] = get_async_store_smem_info(a_sst);
constexpr auto smem_buf_size =
MakeLdsLoadDesc_A().get_element_space_size() * sizeof(ADataType);
static_assert(a_sld.get_num_of_access() == 8);
constexpr auto sld_os = generate_tuple(
[&](auto i_access) {
return number<a_sld.get_bottom_linear_offset(i_access) * sizeof(ADataType)>{};
},
number<a_sld.get_num_of_access()>{});
index_t loop_cnt = k / Block_K;
// this is the acc thread buffer
fp32x4_t v_acc[16]{.0f};
// B nr->kr
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Winline-asm"
// clang-format off
asm volatile(
#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_FP16
#include "uk/flatmm_uk_gfx9_32x512x128_1x1x1_16x16x16.inc"
#undef CK_TILE_FLATMM_UK_MFMA
: [s_loop_cnt]"+s"(loop_cnt),
[v_acc_0]"+v"(v_acc[0]),
[v_acc_1]"+v"(v_acc[1]),
[v_acc_2]"+v"(v_acc[2]),
[v_acc_3]"+v"(v_acc[3]),
[v_acc_4]"+v"(v_acc[4]),
[v_acc_5]"+v"(v_acc[5]),
[v_acc_6]"+v"(v_acc[6]),
[v_acc_7]"+v"(v_acc[7]),
[v_acc_8]"+v"(v_acc[8]),
[v_acc_9]"+v"(v_acc[9]),
[v_acc_10]"+v"(v_acc[10]),
[v_acc_11]"+v"(v_acc[11]),
[v_acc_12]"+v"(v_acc[12]),
[v_acc_13]"+v"(v_acc[13]),
[v_acc_14]"+v"(v_acc[14]),
[v_acc_15]"+v"(v_acc[15]),
[s_mem_]"+r"(smem)
: [s_res_a0]"s"(res_a[0]),
[s_res_a1]"s"(res_a[1]),
[s_res_a2]"s"(res_a[2]),
[s_res_a3]"s"(res_a[3]),
[s_res_b0]"s"(res_b[0]),
[s_res_b1]"s"(res_b[1]),
[s_res_b2]"s"(res_b[2]),
[s_res_b3]"s"(res_b[3]),
[v_os_a0]"v"(static_cast<index_t>(cached_coords_a[number<0>{}] * sizeof(ADataType))),
[v_os_a1]"v"(static_cast<index_t>(cached_coords_a[number<1>{}] * sizeof(ADataType))),
[v_os_a2]"v"(static_cast<index_t>(cached_coords_a[number<2>{}] * sizeof(ADataType))),
[v_os_a3]"v"(static_cast<index_t>(cached_coords_a[number<3>{}] * sizeof(ADataType))),
[v_os_a4]"v"(static_cast<index_t>(cached_coords_a[number<4>{}] * sizeof(ADataType))),
[v_os_a5]"v"(static_cast<index_t>(cached_coords_a[number<5>{}] * sizeof(ADataType))),
[v_os_a6]"v"(static_cast<index_t>(cached_coords_a[number<6>{}] * sizeof(ADataType))),
[v_os_a7]"v"(static_cast<index_t>(cached_coords_a[number<7>{}] * sizeof(ADataType))),
[v_os_b0]"v"(static_cast<index_t>(cached_coords_b[number<0>{}] * sizeof(BDataType))),
[v_os_b1]"v"(static_cast<index_t>(cached_coords_b[number<1>{}] * sizeof(BDataType))),
[v_os_b2]"v"(static_cast<index_t>(cached_coords_b[number<2>{}] * sizeof(BDataType))),
[v_os_b3]"v"(static_cast<index_t>(cached_coords_b[number<3>{}] * sizeof(BDataType))),
[v_os_b4]"v"(static_cast<index_t>(cached_coords_b[number<4>{}] * sizeof(BDataType))),
[v_os_b5]"v"(static_cast<index_t>(cached_coords_b[number<5>{}] * sizeof(BDataType))),
[v_os_b6]"v"(static_cast<index_t>(cached_coords_b[number<6>{}] * sizeof(BDataType))),
[v_os_b7]"v"(static_cast<index_t>(cached_coords_b[number<7>{}] * sizeof(BDataType))),
[v_os_slda]"v"(static_cast<index_t>(a_sld.cached_coords_[number<0>{}].get_offset() * sizeof(ADataType))),
[s_m0_init]"s"(m0_init_value),
[s_size_per_issue]"s"(size_per_issue),
[smem_sz]"n"(smem_buf_size), //(smem_buf_size),
[sld_os_0]"n"(sld_os[number<0>{}].value),
[sld_os_1]"n"(sld_os[number<1>{}].value),
[sld_os_2]"n"(sld_os[number<2>{}].value),
[sld_os_3]"n"(sld_os[number<3>{}].value),
[sld_os_4]"n"(sld_os[number<4>{}].value),
[sld_os_5]"n"(sld_os[number<5>{}].value),
[sld_os_6]"n"(sld_os[number<6>{}].value),
[sld_os_7]"n"(sld_os[number<7>{}].value),
[s_tile_os_a]"s"(tile_offset_a_bytes),
[s_tile_os_b]"s"(tile_offset_b_bytes)
: "memory", "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8", "a9",
"a10", "a11", "a12", "a13", "a14", "a15", "a16", "a17", "a18", "a19",
"a20", "a21", "a22", "a23", "a24", "a25", "a26", "a27", "a28", "a29",
"a30", "a31", "a32", "a33", "a34", "a35", "a36", "a37", "a38", "a39",
"a40", "a41", "a42", "a43", "a44", "a45", "a46", "a47", "a48", "a49",
"a50", "a51", "a52", "a53", "a54", "a55", "a56", "a57", "a58", "a59",
"a60", "a61", "a62", "a63", "a64", "a65", "a66", "a67", "a68", "a69",
"a70", "a71", "a72", "a73", "a74", "a75", "a76", "a77", "a78", "a79",
"a80", "a81", "a82", "a83", "a84", "a85", "a86", "a87", "a88", "a89",
"a90", "a91", "a92", "a93", "a94", "a95", "a96", "a97", "a98", "a99",
"a100", "a101", "a102", "a103", "a104", "a105", "a106", "a107",
"a108", "a109", "a110", "a111", "a112", "a113", "a114", "a115",
"a116", "a117", "a118", "a119", "a120", "a121", "a122", "a123",
"a124", "a125", "a126", "a127", "a128", "a129", "a130", "a131",
"a132", "a133", "a134", "a135", "a136", "a137", "a138", "a139",
"a140", "a141", "a142", "a143", "a144", "a145", "a146", "a147",
"a148", "a149", "a150", "a151", "a152", "a153", "a154", "a155",
"a156", "a157", "a158", "a159", "a160", "a161", "a162", "a163",
"a164", "a165", "a166", "a167", "a168", "a169", "a170", "a171",
"a172", "a173", "a174", "a175", "a176", "a177", "a178", "a179",
"a180", "a181", "a182", "a183", "a184", "a185", "a186", "a187",
"a188", "a189", "a190", "a191", "a192", "a193", "a194", "a195",
"a196", "a197", "a198", "a199", "a200", "a201", "a202", "a203",
"a204", "a205", "a206", "a207", "a208", "a209", "a210", "a211",
"a212", "a213", "a214", "a215", "a216", "a217", "a218", "a219",
"a220", "a221", "a222", "a223", "a224", "a225", "a226", "a227",
"a228", "a229", "a230", "a231", "a232", "a233", "a234", "a235",
"a236", "a237", "a238", "a239", "a240", "a241", "a242", "a243",
"a244", "a245", "a246", "a247", "a248", "a249", "a250", "a251",
"a252", "a253", "a254", "a255",
"s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23",
"s86", // s86 as tmp
"v64", "v65", "v66", "v67", "v68", "v69",
"v70", "v71", "v72", "v73", "v74", "v75", "v76", "v77", "v78", "v79",
"v80", "v81", "v82", "v83", "v84", "v85", "v86", "v87", "v88", "v89",
"v90", "v91", "v92", "v93", "v94", "v95", "v96", "v97", "v98", "v99",
"v100", "v101", "v102", "v103", "v104", "v105", "v106", "v107",
"v108", "v109", "v110", "v111", "v112", "v113", "v114", "v115",
"v116", "v117", "v118", "v119", "v120", "v121", "v122", "v123",
"v124", "v125", "v126", "v127"
);
// clang-format on
#pragma clang diagnostic pop
// return local scratch
auto c = MakeCBlockTile();
for(auto i = 0; i < 16; i++)
{
c.get_thread_buffer()[4 * i + 0] = v_acc[i].x;
c.get_thread_buffer()[4 * i + 1] = v_acc[i].y;
c.get_thread_buffer()[4 * i + 2] = v_acc[i].z;
c.get_thread_buffer()[4 * i + 3] = v_acc[i].w;
}
return c;
}
};
} // namespace ck_tile
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/gemm/warp/warp_gemm.hpp"
#include "ck_tile/ops/flatmm/block/flatmm_uk_config.hpp"
namespace ck_tile {
// "S"tream update output along "N"
// A in smem, B load from global
// require 4 wave, occupancy=1c
struct FlatmmSn_32x256x512_1x4x1_16x16x64_Base
{
static constexpr index_t Block_M = 32;
static constexpr index_t Block_N = 256;
static constexpr index_t Block_K = 512;
static constexpr index_t WarpPerBlock_M = 1;
static constexpr index_t WarpPerBlock_N = 4;
static constexpr index_t WarpPerBlock_K = 1;
static constexpr index_t Warp_M = 16;
static constexpr index_t Warp_N = 16;
static constexpr index_t Warp_K = 64;
static constexpr index_t BlockSize = 256;
// static constexpr index_t KPack = 2; // this is used to gurantee every threads can do dwordx4
// TODO: note Nr/Kr/W need consider KPack
static constexpr index_t Block_W = Warp_N * Warp_K; // 512 element
static constexpr index_t Block_Nr = Block_N / Warp_N; // 32 element, 4 per wave
static constexpr index_t Block_Kr = Block_K / Warp_K; // 4
static constexpr index_t Repeat_M = Block_M / (Warp_M * WarpPerBlock_M); // 2
static constexpr index_t Repeat_N = Block_N / (Warp_N * WarpPerBlock_N); // 2
static constexpr index_t Repeat_K = Block_K / (Warp_K * WarpPerBlock_K); // 16
static CK_TILE_DEVICE constexpr auto MakeCBlockDist()
{
constexpr auto c_block_outer_dstr_encoding = tile_distribution_encoding<
sequence<>,
tuple<sequence<Repeat_M, WarpPerBlock_M>, sequence<Repeat_N, WarpPerBlock_N>>,
tuple<sequence<1, 2>>,
tuple<sequence<1, 1>>,
sequence<2, 1>, // !! note here is different
sequence<0, 0>>{};
using WG = WarpGemmMfma_i32_16x16x64_int8_int8_CTransposed;
constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
c_block_outer_dstr_encoding, typename WG::CWarpDstrEncoding{});
constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode);
return c_block_dstr;
}
CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize()
{
// y y p p p y
// reg before shfl M0(2)*N0(2)*Nl(4)*Nw(4)*Mw(16)*Nv(4)
// but order is N0*M0*Nv
// in LDS we need store as
// M0(2)* N0(2) * Nl(4) * Nw(4) * (Mw(16)*Nv(4) + 4)
// y y wave-id lid/16 lid%16 v
return 2 * 2 * 4 * 4 * (16 * 4 + 4) * sizeof(bf16_t);
}
};
struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x1_16x16x64_Base
{
using BDataType = int8_t;
using ODataType = int8_t;
// TODO: need paired with tile_window_linear!
// TODO: need call init_raw() before call this function!
// template <typename AWindow, typename BWindow, typename OWindow, typename ScaleTensor>
template <typename BRes,
typename BCoords,
typename ORes,
typename OCoords,
typename OFlags,
typename ScaleTensor>
CK_TILE_DEVICE auto
operator()(const BRes& res_b,
const BCoords& cached_coords_b,
const ORes& res_o,
const OCoords& cached_coords_o,
const OFlags& o_flags, // this should be in sgpr
CK_TILE_LDS_ADDR void* smem,
index_t n, // loop along n dim
const ScaleTensor& scale_,
index_t tile_offset_b, // stride b is fixed to blockKr * blockW, but still can adjust
index_t tile_offset_half_b, //splited load alone K in to 2 part
index_t tile_offset_o)
{
static_assert(BCoords::size() == 8); // 8
static_assert(OCoords::size() == 8);
const index_t tile_stride_b_bytes = tile_offset_b * sizeof(BDataType);
const index_t tile_offset_half_b_bytes = tile_offset_half_b * sizeof(BDataType);
const index_t tile_stride_o_bytes = tile_offset_o * sizeof(ODataType);
static_assert(ScaleTensor::size() == 2);
float s0 = scale_[number<0>{}];
float s1 = scale_[number<1>{}];
index_t loop_cnt = n / Block_N;
register float v_c0 asm("v64");
register float v_c1 asm("v65");
register float v_c2 asm("v66");
register float v_c3 asm("v67");
register float v_c4 asm("v68");
register float v_c5 asm("v69");
register float v_c6 asm("v70");
register float v_c7 asm("v71");
register float v_c8 asm("v72");
register float v_c9 asm("v73");
register float v_c10 asm("v74");
register float v_c11 asm("v75");
register float v_c12 asm("v76");
register float v_c13 asm("v77");
register float v_c14 asm("v78");
register float v_c15 asm("v79");
register float v_c16 asm("v80");
register float v_c17 asm("v81");
register float v_c18 asm("v82");
register float v_c19 asm("v83");
register float v_c20 asm("v84");
register float v_c21 asm("v85");
register float v_c22 asm("v86");
register float v_c23 asm("v87");
register float v_c24 asm("v88");
register float v_c25 asm("v89");
register float v_c26 asm("v90");
register float v_c27 asm("v91");
register float v_c28 asm("v92");
register float v_c29 asm("v93");
register float v_c30 asm("v94");
register float v_c31 asm("v95");
int32_t nan_hi = 0x7fff0000;
int32_t nan_lo = 0x00007fff;
// in smem, the layout is M0(2)*K0(128)*M1(16)*K1(4)
// every threads need 8xK in contiguous register
// ... and every wave need the same data
int lane_id = threadIdx.x % 64;
int sld_y_os = (lane_id % 16) * 4 + (lane_id / 16) * 128;
sld_y_os *= 2;
// y y p p p y
// reg before shfl M0(2)*N0(2)*Nl(4)*Nw(4)*Mw(16)*Nv(4)
// but order is N0*M0*Nv
// in LDS we need store as
// M0(2)* N0(2) * Nl(4) * Nw(4) * (Mw(16)*Nv(4) + 4)
// y y wave-id lid/16 lid%16 v
// sst(v3) = (v0/16*34 + v0%16 * 2 + wid*136) * 4
int sfl_sst = (threadIdx.x % 16 * 4) + (threadIdx.x / 16) * (64 + 4);
sfl_sst *= 2;
// from LDS we need load as
// M0(2)* N0(2) * Nl(4) * Nw(4) * (Mw(16) * Nv(4) + 4)
// ( 2 issue) (rem 32-lane) (4 wave*4issue) 2lane*1ussue(pk2)
// sld(v4) = v0/2 *34*4 + v0 % 2 *4 + wid*2 *4
int sfl_sld = (lane_id % 2) * 2 + (lane_id / 2) * (64 + 4) + (threadIdx.x / 64) * 4;
sfl_sld *= 2;
// B nr->kr
// clang-format off
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Winline-asm"
asm volatile(
#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_BF16
#include "uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.inc"
#undef CK_TILE_FLATMM_UK_MFMA
:[smem_]"+r"(smem),
[s_loop_cnt]"+s"(loop_cnt),
[c0]"+v" (v_c0),
[c1]"+v" (v_c1),
[c2]"+v" (v_c2),
[c3]"+v" (v_c3),
[c4]"+v" (v_c4),
[c5]"+v" (v_c5),
[c6]"+v" (v_c6),
[c7]"+v" (v_c7),
[c8]"+v" (v_c8),
[c9]"+v" (v_c9),
[c10]"+v"(v_c10),
[c11]"+v"(v_c11),
[c12]"+v"(v_c12),
[c13]"+v"(v_c13),
[c14]"+v"(v_c14),
[c15]"+v"(v_c15),
[c16]"+v"(v_c16),
[c17]"+v"(v_c17),
[c18]"+v"(v_c18),
[c19]"+v"(v_c19),
[c20]"+v"(v_c20),
[c21]"+v"(v_c21),
[c22]"+v"(v_c22),
[c23]"+v"(v_c23),
[c24]"+v"(v_c24),
[c25]"+v"(v_c25),
[c26]"+v"(v_c26),
[c27]"+v"(v_c27),
[c28]"+v"(v_c28),
[c29]"+v"(v_c29),
[c30]"+v"(v_c30),
[c31]"+v"(v_c31)
:
[sld_a_base]"n"(0),
[shfl_base]"n"(0),
[v_sld_y_os]"v"(sld_y_os),
[v_sfl_sld]"v"(sfl_sld),
[v_sfl_sst]"v"(sfl_sst),
[s_res_o0]"s"(res_o[0]),
[s_res_o1]"s"(res_o[1]),
//[s_res_o2]"s"(res_o[2]),
//[s_res_o3]"s"(res_o[3]),
[s_res_b0]"s"(res_b[0]),
[s_res_b1]"s"(res_b[1]),
[s_res_b2]"s"(res_b[2]),
[s_res_b3]"s"(res_b[3]),
[v_os_o0]"v"(static_cast<index_t>(cached_coords_o[number<0>{}] * sizeof(ODataType))),
[v_os_o1]"v"(static_cast<index_t>(cached_coords_o[number<1>{}] * sizeof(ODataType))),
[v_os_o2]"v"(static_cast<index_t>(cached_coords_o[number<2>{}] * sizeof(ODataType))),
[v_os_o3]"v"(static_cast<index_t>(cached_coords_o[number<3>{}] * sizeof(ODataType))),
[v_os_o4]"v"(static_cast<index_t>(cached_coords_o[number<4>{}] * sizeof(ODataType))),
[v_os_o5]"v"(static_cast<index_t>(cached_coords_o[number<5>{}] * sizeof(ODataType))),
[v_os_o6]"v"(static_cast<index_t>(cached_coords_o[number<6>{}] * sizeof(ODataType))),
[v_os_o7]"v"(static_cast<index_t>(cached_coords_o[number<7>{}] * sizeof(ODataType))),
[v_os_b0]"v"(static_cast<index_t>(cached_coords_b[number<0>{}] * sizeof(BDataType))),
[v_os_b1]"v"(static_cast<index_t>(cached_coords_b[number<1>{}] * sizeof(BDataType))),
[v_os_b2]"v"(static_cast<index_t>(cached_coords_b[number<2>{}] * sizeof(BDataType))),
[v_os_b3]"v"(static_cast<index_t>(cached_coords_b[number<3>{}] * sizeof(BDataType))),
[v_os_b4]"v"(static_cast<index_t>(cached_coords_b[number<4>{}] * sizeof(BDataType))),
[v_os_b5]"v"(static_cast<index_t>(cached_coords_b[number<5>{}] * sizeof(BDataType))),
[v_os_b6]"v"(static_cast<index_t>(cached_coords_b[number<6>{}] * sizeof(BDataType))),
[v_os_b7]"v"(static_cast<index_t>(cached_coords_b[number<7>{}] * sizeof(BDataType))),
[s_tile_os_o]"s"(tile_stride_o_bytes),
[s_tile_os_b_half]"s"(tile_offset_half_b_bytes),
[s_tile_os_b]"s"(tile_stride_b_bytes),
[scale_0]"v"(s0),
[scale_1]"v"(s1),
[v_nan_lo]"v"(nan_lo),
[v_nan_hi]"v"(nan_hi),
[s_execflag_0]"s"(o_flags[number<0>{}]),
[s_execflag_1]"s"(o_flags[number<1>{}]),
[s_execflag_2]"s"(o_flags[number<2>{}]),
[s_execflag_3]"s"(o_flags[number<3>{}]),
[s_execflag_4]"s"(o_flags[number<4>{}]),
[s_execflag_5]"s"(o_flags[number<5>{}]),
[s_execflag_6]"s"(o_flags[number<6>{}]),
[s_execflag_7]"s"(o_flags[number<7>{}])
:
"memory", "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8", "a9",
"a10", "a11", "a12", "a13", "a14", "a15", "a16", "a17", "a18", "a19",
"a20", "a21", "a22", "a23", "a24", "a25", "a26", "a27", "a28", "a29",
"a30", "a31", "a32", "a33", "a34", "a35", "a36", "a37", "a38", "a39",
"a40", "a41", "a42", "a43", "a44", "a45", "a46", "a47", "a48", "a49",
"a50", "a51", "a52", "a53", "a54", "a55", "a56", "a57", "a58", "a59",
"a60", "a61", "a62", "a63", "a64", "a65", "a66", "a67", "a68", "a69",
"a70", "a71", "a72", "a73", "a74", "a75", "a76", "a77", "a78", "a79",
"a80", "a81", "a82", "a83", "a84", "a85", "a86", "a87", "a88", "a89",
"a90", "a91", "a92", "a93", "a94", "a95", "a96", "a97", "a98", "a99",
"a100", "a101", "a102", "a103", "a104", "a105", "a106", "a107",
"a108", "a109", "a110", "a111", "a112", "a113", "a114", "a115",
"a116", "a117", "a118", "a119", "a120", "a121", "a122", "a123",
"a124", "a125", "a126", "a127", "a128", "a129", "a130", "a131",
"a132", "a133", "a134", "a135", "a136", "a137", "a138", "a139",
"a140", "a141", "a142", "a143", "a144", "a145", "a146", "a147",
"a148", "a149", "a150", "a151", "a152", "a153", "a154", "a155",
"a156", "a157", "a158", "a159", "a160", "a161", "a162", "a163",
"a164", "a165", "a166", "a167", "a168", "a169", "a170", "a171",
"a172", "a173", "a174", "a175", "a176", "a177", "a178", "a179",
"a180", "a181", "a182", "a183", "a184", "a185", "a186", "a187",
"a188", "a189", "a190", "a191", "a192", "a193", "a194", "a195",
"a196", "a197", "a198", "a199", "a200", "a201", "a202", "a203",
"a204", "a205", "a206", "a207", "a208", "a209", "a210", "a211",
"a212", "a213", "a214", "a215", "a216", "a217", "a218", "a219",
"a220", "a221", "a222", "a223", "a224", "a225", "a226", "a227",
"a228", "a229", "a230", "a231", "a232", "a233", "a234", "a235",
"a236", "a237", "a238", "a239", "a240", "a241", "a242", "a243",
"a244", "a245", "a246", "a247", "a248", "a249", "a250", "a251",
"a252", "a253", "a254", "a255",
"s8", "s9", "s12", "s13", "s14", "s15", "s38", "s39", "s52", "s86",
"s36", "s37",
"v50", "v54", "v55",
"v64","v65","v66","v67","v68","v69","v70","v71",
"v72","v73","v74","v75","v76","v77","v78","v79",
"v80","v81","v82","v83","v84","v85","v86","v87",
"v88","v89","v90","v91","v92","v93","v94","v95",
"v128", "v129", "v130", "v131",
"v132", "v133", "v134", "v135", "v136", "v137", "v138", "v139",
"v140", "v141", "v142", "v143", "v144", "v145", "v146", "v147",
"v148", "v149", "v150", "v151", "v152", "v153", "v154", "v155",
"v156", "v157", "v158", "v159", "v160", "v161", "v162", "v163",
"v164", "v165", "v166", "v167", "v168", "v169", "v170", "v171",
"v172", "v173", "v174", "v175", "v176", "v177", "v178", "v179",
"v180", "v181", "v182", "v183", "v184", "v185", "v186", "v187",
"v188", "v189", "v190", "v191", "v192", "v193", "v194", "v195",
"v196", "v197", "v198", "v199", "v200", "v201", "v202", "v203",
"v204", "v205", "v206", "v207", "v208", "v209", "v210", "v211",
"v212", "v213", "v214", "v215", "v216", "v217", "v218", "v219",
"v220", "v221", "v222", "v223", "v224", "v225", "v226", "v227",
"v228", "v229", "v230", "v231", "v232", "v233", "v234", "v235",
"v236", "v237", "v238", "v239", "v240", "v241", "v242", "v243",
"v244", "v245", "v246", "v247", "v248", "v249", "v250", "v251",
"v252", "v253", "v254", "v255"
);
#pragma clang diagnostic pop
// clang-format on
}
};
struct FlatmmSn_32x128x512_1x4x1_16x16x32_FP16 : public FlatmmSn_32x128x512_1x4x1_16x16x32_Base
{
using BDataType = bf16_t;
using ODataType = bf16_t;
// TODO: need paired with tile_window_linear!
// TODO: need call init_raw() before call this function!
// template <typename AWindow, typename BWindow, typename OWindow, typename ScaleTensor>
template <typename BRes,
typename BCoords,
typename ORes,
typename OCoords,
typename OFlags,
typename ScaleTensor>
CK_TILE_DEVICE auto
operator()(const BRes& res_b,
const BCoords& cached_coords_b,
const ORes& res_o,
const OCoords& cached_coords_o,
const OFlags& o_flags, // this should be in sgpr
CK_TILE_LDS_ADDR void* smem,
index_t n, // loop along n dim
const ScaleTensor& scale_,
index_t tile_offset_b, // stride b is fixed to blockKr * blockW, but still can adjust
index_t tile_offset_o)
{
static_assert(BCoords::size() == 8); // 8
static_assert(OCoords::size() == 8);
const index_t tile_stride_b_bytes = tile_offset_b * sizeof(BDataType);
const index_t tile_stride_o_bytes = tile_offset_o * sizeof(ODataType);
static_assert(ScaleTensor::size() == 2);
float s0 = scale_[number<0>{}];
float s1 = scale_[number<1>{}];
index_t loop_cnt = n / Block_N;
register float v_c0 asm("v64");
register float v_c1 asm("v65");
register float v_c2 asm("v66");
register float v_c3 asm("v67");
register float v_c4 asm("v68");
register float v_c5 asm("v69");
register float v_c6 asm("v70");
register float v_c7 asm("v71");
register float v_c8 asm("v72");
register float v_c9 asm("v73");
register float v_c10 asm("v74");
register float v_c11 asm("v75");
register float v_c12 asm("v76");
register float v_c13 asm("v77");
register float v_c14 asm("v78");
register float v_c15 asm("v79");
register float v_c16 asm("v80");
register float v_c17 asm("v81");
register float v_c18 asm("v82");
register float v_c19 asm("v83");
register float v_c20 asm("v84");
register float v_c21 asm("v85");
register float v_c22 asm("v86");
register float v_c23 asm("v87");
register float v_c24 asm("v88");
register float v_c25 asm("v89");
register float v_c26 asm("v90");
register float v_c27 asm("v91");
register float v_c28 asm("v92");
register float v_c29 asm("v93");
register float v_c30 asm("v94");
register float v_c31 asm("v95");
int32_t nan_hi = 0x7fff0000;
int32_t nan_lo = 0x00007fff;
// in smem, the layout is M0(2)*K0(128)*M1(16)*K1(4)
// every threads need 8xK in contiguous register
// ... and every wave need the same data
int lane_id = threadIdx.x % 64;
int sld_y_os = (lane_id % 16) * 4 + (lane_id / 16) * 128;
sld_y_os *= 2;
// y y p p p y
// reg before shfl M0(2)*N0(2)*Nl(4)*Nw(4)*Mw(16)*Nv(4)
// but order is N0*M0*Nv
// in LDS we need store as
// M0(2)* N0(2) * Nl(4) * Nw(4) * (Mw(16)*Nv(4) + 4)
// y y wave-id lid/16 lid%16 v
// sst(v3) = (v0/16*34 + v0%16 * 2 + wid*136) * 4
int sfl_sst = (threadIdx.x % 16 * 4) + (threadIdx.x / 16) * (64 + 4);
sfl_sst *= 2;
// from LDS we need load as
// M0(2)* N0(2) * Nl(4) * Nw(4) * (Mw(16) * Nv(4) + 4)
// ( 2 issue) (rem 32-lane) (4 wave*4issue) 2lane*1ussue(pk2)
// sld(v4) = v0/2 *34*4 + v0 % 2 *4 + wid*2 *4
int sfl_sld = (lane_id % 2) * 2 + (lane_id / 2) * (64 + 4) + (threadIdx.x / 64) * 4;
sfl_sld *= 2;
// B nr->kr
// clang-format off
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Winline-asm"
asm volatile(
#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_FP16
#include "uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.inc"
#undef CK_TILE_FLATMM_UK_MFMA
:[smem_]"+r"(smem),
[s_loop_cnt]"+s"(loop_cnt),
[c0]"+v" (v_c0),
[c1]"+v" (v_c1),
[c2]"+v" (v_c2),
[c3]"+v" (v_c3),
[c4]"+v" (v_c4),
[c5]"+v" (v_c5),
[c6]"+v" (v_c6),
[c7]"+v" (v_c7),
[c8]"+v" (v_c8),
[c9]"+v" (v_c9),
[c10]"+v"(v_c10),
[c11]"+v"(v_c11),
[c12]"+v"(v_c12),
[c13]"+v"(v_c13),
[c14]"+v"(v_c14),
[c15]"+v"(v_c15),
[c16]"+v"(v_c16),
[c17]"+v"(v_c17),
[c18]"+v"(v_c18),
[c19]"+v"(v_c19),
[c20]"+v"(v_c20),
[c21]"+v"(v_c21),
[c22]"+v"(v_c22),
[c23]"+v"(v_c23),
[c24]"+v"(v_c24),
[c25]"+v"(v_c25),
[c26]"+v"(v_c26),
[c27]"+v"(v_c27),
[c28]"+v"(v_c28),
[c29]"+v"(v_c29),
[c30]"+v"(v_c30),
[c31]"+v"(v_c31)
:
[sld_a_base]"n"(0),
[shfl_base]"n"(0),
[v_sld_y_os]"v"(sld_y_os),
[v_sfl_sld]"v"(sfl_sld),
[v_sfl_sst]"v"(sfl_sst),
[s_res_o0]"s"(res_o[0]),
[s_res_o1]"s"(res_o[1]),
//[s_res_o2]"s"(res_o[2]),
//[s_res_o3]"s"(res_o[3]),
[s_res_b0]"s"(res_b[0]),
[s_res_b1]"s"(res_b[1]),
[s_res_b2]"s"(res_b[2]),
[s_res_b3]"s"(res_b[3]),
[v_os_o0]"v"(static_cast<index_t>(cached_coords_o[number<0>{}] * sizeof(ODataType))),
[v_os_o1]"v"(static_cast<index_t>(cached_coords_o[number<1>{}] * sizeof(ODataType))),
[v_os_o2]"v"(static_cast<index_t>(cached_coords_o[number<2>{}] * sizeof(ODataType))),
[v_os_o3]"v"(static_cast<index_t>(cached_coords_o[number<3>{}] * sizeof(ODataType))),
[v_os_o4]"v"(static_cast<index_t>(cached_coords_o[number<4>{}] * sizeof(ODataType))),
[v_os_o5]"v"(static_cast<index_t>(cached_coords_o[number<5>{}] * sizeof(ODataType))),
[v_os_o6]"v"(static_cast<index_t>(cached_coords_o[number<6>{}] * sizeof(ODataType))),
[v_os_o7]"v"(static_cast<index_t>(cached_coords_o[number<7>{}] * sizeof(ODataType))),
[v_os_b0]"v"(static_cast<index_t>(cached_coords_b[number<0>{}] * sizeof(BDataType))),
[v_os_b1]"v"(static_cast<index_t>(cached_coords_b[number<1>{}] * sizeof(BDataType))),
[v_os_b2]"v"(static_cast<index_t>(cached_coords_b[number<2>{}] * sizeof(BDataType))),
[v_os_b3]"v"(static_cast<index_t>(cached_coords_b[number<3>{}] * sizeof(BDataType))),
[v_os_b4]"v"(static_cast<index_t>(cached_coords_b[number<4>{}] * sizeof(BDataType))),
[v_os_b5]"v"(static_cast<index_t>(cached_coords_b[number<5>{}] * sizeof(BDataType))),
[v_os_b6]"v"(static_cast<index_t>(cached_coords_b[number<6>{}] * sizeof(BDataType))),
[v_os_b7]"v"(static_cast<index_t>(cached_coords_b[number<7>{}] * sizeof(BDataType))),
[s_tile_os_o]"s"(tile_stride_o_bytes),
[s_tile_os_b]"s"(tile_stride_b_bytes),
[scale_0]"v"(s0),
[scale_1]"v"(s1),
[v_nan_lo]"v"(nan_lo),
[v_nan_hi]"v"(nan_hi),
[s_execflag_0]"s"(o_flags[number<0>{}]),
[s_execflag_1]"s"(o_flags[number<1>{}]),
[s_execflag_2]"s"(o_flags[number<2>{}]),
[s_execflag_3]"s"(o_flags[number<3>{}]),
[s_execflag_4]"s"(o_flags[number<4>{}]),
[s_execflag_5]"s"(o_flags[number<5>{}]),
[s_execflag_6]"s"(o_flags[number<6>{}]),
[s_execflag_7]"s"(o_flags[number<7>{}])
:
"memory", "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8", "a9",
"a10", "a11", "a12", "a13", "a14", "a15", "a16", "a17", "a18", "a19",
"a20", "a21", "a22", "a23", "a24", "a25", "a26", "a27", "a28", "a29",
"a30", "a31", "a32", "a33", "a34", "a35", "a36", "a37", "a38", "a39",
"a40", "a41", "a42", "a43", "a44", "a45", "a46", "a47", "a48", "a49",
"a50", "a51", "a52", "a53", "a54", "a55", "a56", "a57", "a58", "a59",
"a60", "a61", "a62", "a63", "a64", "a65", "a66", "a67", "a68", "a69",
"a70", "a71", "a72", "a73", "a74", "a75", "a76", "a77", "a78", "a79",
"a80", "a81", "a82", "a83", "a84", "a85", "a86", "a87", "a88", "a89",
"a90", "a91", "a92", "a93", "a94", "a95", "a96", "a97", "a98", "a99",
"a100", "a101", "a102", "a103", "a104", "a105", "a106", "a107",
"a108", "a109", "a110", "a111", "a112", "a113", "a114", "a115",
"a116", "a117", "a118", "a119", "a120", "a121", "a122", "a123",
"a124", "a125", "a126", "a127", "a128", "a129", "a130", "a131",
"a132", "a133", "a134", "a135", "a136", "a137", "a138", "a139",
"a140", "a141", "a142", "a143", "a144", "a145", "a146", "a147",
"a148", "a149", "a150", "a151", "a152", "a153", "a154", "a155",
"a156", "a157", "a158", "a159", "a160", "a161", "a162", "a163",
"a164", "a165", "a166", "a167", "a168", "a169", "a170", "a171",
"a172", "a173", "a174", "a175", "a176", "a177", "a178", "a179",
"a180", "a181", "a182", "a183", "a184", "a185", "a186", "a187",
"a188", "a189", "a190", "a191", "a192", "a193", "a194", "a195",
"a196", "a197", "a198", "a199", "a200", "a201", "a202", "a203",
"a204", "a205", "a206", "a207", "a208", "a209", "a210", "a211",
"a212", "a213", "a214", "a215", "a216", "a217", "a218", "a219",
"a220", "a221", "a222", "a223", "a224", "a225", "a226", "a227",
"a228", "a229", "a230", "a231", "a232", "a233", "a234", "a235",
"a236", "a237", "a238", "a239", "a240", "a241", "a242", "a243",
"a244", "a245", "a246", "a247", "a248", "a249", "a250", "a251",
"a252", "a253", "a254", "a255",
"s8", "s9", "s12", "s13", "s14", "s15", "s38", "s39", "s52", "s86",
"s36", "s37",
"v50", "v54", "v55",
"v64","v65","v66","v67","v68","v69","v70","v71",
"v72","v73","v74","v75","v76","v77","v78","v79",
"v80","v81","v82","v83","v84","v85","v86","v87",
"v88","v89","v90","v91","v92","v93","v94","v95",
"v128", "v129", "v130", "v131",
"v132", "v133", "v134", "v135", "v136", "v137", "v138", "v139",
"v140", "v141", "v142", "v143", "v144", "v145", "v146", "v147",
"v148", "v149", "v150", "v151", "v152", "v153", "v154", "v155",
"v156", "v157", "v158", "v159", "v160", "v161", "v162", "v163",
"v164", "v165", "v166", "v167", "v168", "v169", "v170", "v171",
"v172", "v173", "v174", "v175", "v176", "v177", "v178", "v179",
"v180", "v181", "v182", "v183", "v184", "v185", "v186", "v187",
"v188", "v189", "v190", "v191", "v192", "v193", "v194", "v195",
"v196", "v197", "v198", "v199", "v200", "v201", "v202", "v203",
"v204", "v205", "v206", "v207", "v208", "v209", "v210", "v211",
"v212", "v213", "v214", "v215", "v216", "v217", "v218", "v219",
"v220", "v221", "v222", "v223", "v224", "v225", "v226", "v227",
"v228", "v229", "v230", "v231", "v232", "v233", "v234", "v235",
"v236", "v237", "v238", "v239", "v240", "v241", "v242", "v243",
"v244", "v245", "v246", "v247", "v248", "v249", "v250", "v251",
"v252", "v253", "v254", "v255"
);
#pragma clang diagnostic pop
// clang-format on
}
};
} // namespace ck_tile
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#define CK_TILE_FLATMM_UK_MFMA_FP16 0
#define CK_TILE_FLATMM_UK_MFMA_BF16 1
#define CK_TILE_FLATMM_UK_MFMA_INT8 2
#define CK_TILE_FLATMM_UK_MFMA_FP8 3
#define CK_TILE_FLATMM_UK_MFMA_BF8 4
#ifndef CK_TILE_FLATMM_UK_MFMA
#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_BF16
#endif
#if CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_BF16
# define _UK_MFMA_ "v_mfma_f32_16x16x16_bf16"
# define _UK_PK_CVT_(x0_, x1_, y_) \
" v_cmp_u_f32 s[36:37], " x0_ ", " x0_ " \n" \
" v_add3_u32 v50, " x0_ ", %[v_nan_lo], 1 \n" \
" v_cndmask_b32 v54, v50, %[v_nan_hi], s[36:37] \n" \
" v_cmp_u_f32 s[36:37], " x1_ ", " x1_ " \n" \
" v_add3_u32 v50, " x1_ ", %[v_nan_lo], 1 \n" \
" v_cndmask_b32 v55, v50, %[v_nan_hi], s[36:37] \n" \
" v_perm_b32 " y_ ", v55, v54, s52 \n"
# define _UK_ATOMIC_ADD_ "global_atomic_pk_add_bf16"
#elif CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_FP16
#define _UK_MFMA_ "v_mfma_f32_16x16x16_f16"
# define _UK_PK_CVT_(x0_, x1_, y_) \
" v_cvt_f16_f32 v54, " x0_ " \n" \
" v_cvt_f16_f32 v55, " x1_ " \n" \
" v_pack_b32_f16 " y_ ", v54, v55 \n"
# define _UK_ATOMIC_ADD_ "global_atomic_pk_add_f16"
#endif
";-------------------------------------------------------------\n"
" s_mov_b32 s52, 0x07060302 ; v_perm\n"
" s_mov_b64 s[38:39], exec ; save current exec\n"
" s_mov_b32 s8, %[s_res_o0] \n"
" s_mov_b32 s9, %[s_res_o1] \n"
" s_mov_b32 s12, %[s_res_b0] \n"
" s_mov_b32 s13, %[s_res_b1] \n"
" s_mov_b32 s14, %[s_res_b2] \n"
" s_mov_b32 s15, %[s_res_b3] \n"
" ds_read_b64 v[128:129], %[v_sld_y_os] offset:0 + %[sld_a_base] \n"
" ds_read_b64 v[130:131], %[v_sld_y_os] offset:128 + %[sld_a_base] \n"
" ds_read_b64 v[132:133], %[v_sld_y_os] offset:1024 + %[sld_a_base] \n"
" ds_read_b64 v[134:135], %[v_sld_y_os] offset:1152 + %[sld_a_base] \n"
" ds_read_b64 v[136:137], %[v_sld_y_os] offset:2048 + %[sld_a_base] \n"
" ds_read_b64 v[138:139], %[v_sld_y_os] offset:2176 + %[sld_a_base] \n"
" ds_read_b64 v[140:141], %[v_sld_y_os] offset:3072 + %[sld_a_base] \n"
" ds_read_b64 v[142:143], %[v_sld_y_os] offset:3200 + %[sld_a_base] \n"
" ds_read_b64 v[144:145], %[v_sld_y_os] offset:4096 + %[sld_a_base] \n"
" ds_read_b64 v[146:147], %[v_sld_y_os] offset:4224 + %[sld_a_base] \n"
" ds_read_b64 v[148:149], %[v_sld_y_os] offset:5120 + %[sld_a_base] \n"
" ds_read_b64 v[150:151], %[v_sld_y_os] offset:5248 + %[sld_a_base] \n"
" ds_read_b64 v[152:153], %[v_sld_y_os] offset:6144 + %[sld_a_base] \n"
" ds_read_b64 v[154:155], %[v_sld_y_os] offset:6272 + %[sld_a_base] \n"
" ds_read_b64 v[156:157], %[v_sld_y_os] offset:7168 + %[sld_a_base] \n"
" ds_read_b64 v[158:159], %[v_sld_y_os] offset:7296 + %[sld_a_base] \n"
" ds_read_b64 v[160:161], %[v_sld_y_os] offset:8192 + %[sld_a_base] \n"
" ds_read_b64 v[162:163], %[v_sld_y_os] offset:8320 + %[sld_a_base] \n"
" ds_read_b64 v[164:165], %[v_sld_y_os] offset:9216 + %[sld_a_base] \n"
" ds_read_b64 v[166:167], %[v_sld_y_os] offset:9344 + %[sld_a_base] \n"
" ds_read_b64 v[168:169], %[v_sld_y_os] offset:10240 + %[sld_a_base] \n"
" ds_read_b64 v[170:171], %[v_sld_y_os] offset:10368 + %[sld_a_base] \n"
" ds_read_b64 v[172:173], %[v_sld_y_os] offset:11264 + %[sld_a_base] \n"
" ds_read_b64 v[174:175], %[v_sld_y_os] offset:11392 + %[sld_a_base] \n"
" ds_read_b64 v[176:177], %[v_sld_y_os] offset:12288 + %[sld_a_base] \n"
" ds_read_b64 v[178:179], %[v_sld_y_os] offset:12416 + %[sld_a_base] \n"
" ds_read_b64 v[180:181], %[v_sld_y_os] offset:13312 + %[sld_a_base] \n"
" ds_read_b64 v[182:183], %[v_sld_y_os] offset:13440 + %[sld_a_base] \n"
" ds_read_b64 v[184:185], %[v_sld_y_os] offset:14336 + %[sld_a_base] \n"
" ds_read_b64 v[186:187], %[v_sld_y_os] offset:14464 + %[sld_a_base] \n"
" ds_read_b64 v[188:189], %[v_sld_y_os] offset:15360 + %[sld_a_base] \n"
" ds_read_b64 v[190:191], %[v_sld_y_os] offset:15488 + %[sld_a_base] \n"
" ds_read_b64 v[192:193], %[v_sld_y_os] offset:16384 + %[sld_a_base] \n"
" ds_read_b64 v[194:195], %[v_sld_y_os] offset:16512 + %[sld_a_base] \n"
" ds_read_b64 v[196:197], %[v_sld_y_os] offset:17408 + %[sld_a_base] \n"
" ds_read_b64 v[198:199], %[v_sld_y_os] offset:17536 + %[sld_a_base] \n"
" ds_read_b64 v[200:201], %[v_sld_y_os] offset:18432 + %[sld_a_base] \n"
" ds_read_b64 v[202:203], %[v_sld_y_os] offset:18560 + %[sld_a_base] \n"
" ds_read_b64 v[204:205], %[v_sld_y_os] offset:19456 + %[sld_a_base] \n"
" ds_read_b64 v[206:207], %[v_sld_y_os] offset:19584 + %[sld_a_base] \n"
" ds_read_b64 v[208:209], %[v_sld_y_os] offset:20480 + %[sld_a_base] \n"
" ds_read_b64 v[210:211], %[v_sld_y_os] offset:20608 + %[sld_a_base] \n"
" ds_read_b64 v[212:213], %[v_sld_y_os] offset:21504 + %[sld_a_base] \n"
" ds_read_b64 v[214:215], %[v_sld_y_os] offset:21632 + %[sld_a_base] \n"
" ds_read_b64 v[216:217], %[v_sld_y_os] offset:22528 + %[sld_a_base] \n"
" ds_read_b64 v[218:219], %[v_sld_y_os] offset:22656 + %[sld_a_base] \n"
" ds_read_b64 v[220:221], %[v_sld_y_os] offset:23552 + %[sld_a_base] \n"
" ds_read_b64 v[222:223], %[v_sld_y_os] offset:23680 + %[sld_a_base] \n"
" ds_read_b64 v[224:225], %[v_sld_y_os] offset:24576 + %[sld_a_base] \n"
" ds_read_b64 v[226:227], %[v_sld_y_os] offset:24704 + %[sld_a_base] \n"
" ds_read_b64 v[228:229], %[v_sld_y_os] offset:25600 + %[sld_a_base] \n"
" ds_read_b64 v[230:231], %[v_sld_y_os] offset:25728 + %[sld_a_base] \n"
" ds_read_b64 v[232:233], %[v_sld_y_os] offset:26624 + %[sld_a_base] \n"
" ds_read_b64 v[234:235], %[v_sld_y_os] offset:26752 + %[sld_a_base] \n"
" ds_read_b64 v[236:237], %[v_sld_y_os] offset:27648 + %[sld_a_base] \n"
" ds_read_b64 v[238:239], %[v_sld_y_os] offset:27776 + %[sld_a_base] \n"
" ds_read_b64 v[240:241], %[v_sld_y_os] offset:28672 + %[sld_a_base] \n"
" ds_read_b64 v[242:243], %[v_sld_y_os] offset:28800 + %[sld_a_base] \n"
" ds_read_b64 v[244:245], %[v_sld_y_os] offset:29696 + %[sld_a_base] \n"
" ds_read_b64 v[246:247], %[v_sld_y_os] offset:29824 + %[sld_a_base] \n"
" ds_read_b64 v[248:249], %[v_sld_y_os] offset:30720 + %[sld_a_base] \n"
" ds_read_b64 v[250:251], %[v_sld_y_os] offset:30848 + %[sld_a_base] \n"
" ds_read_b64 v[252:253], %[v_sld_y_os] offset:31744 + %[sld_a_base] \n"
" ds_read_b64 v[254:255], %[v_sld_y_os] offset:31872 + %[sld_a_base] \n"
" s_waitcnt 0 \n"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen \n"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen \n"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen \n"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen \n"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[12:15], 0 offen \n"
" buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[12:15], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[12:15], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[12:15], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[12:15], 0 offen \n"
" buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[12:15], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[12:15], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[12:15], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[12:15], 0 offen \n"
" buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[12:15], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[12:15], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[12:15], 0 offen offset:3072 \n"
" buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[12:15], 0 offen \n"
" buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[12:15], 0 offen offset:1024 \n"
" buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[12:15], 0 offen offset:2048 \n"
" buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[12:15], 0 offen offset:3072 \n"
" s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond \n"
" s_cselect_b32 s86, %[s_tile_os_b], 0 \n"
" s_add_u32 s12, s86, s12 \n"
" s_addc_u32 s13, 0, s13 \n"
" s_waitcnt 0 \n"
"L_start%=: \n"
" s_waitcnt vmcnt(32) \n"
" s_barrier \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[0:1], v[128:129], 0 \n"
" buffer_load_dwordx4 acc[128:131], %[v_os_b0], s[12:15], 0 offen \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[2:3], v[130:131], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[4:5], v[132:133], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[6:7], v[134:135], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[8:9], v[136:137], [%[c0], %[c1], %[c2], %[c3]] \n"
" buffer_load_dwordx4 acc[132:135], %[v_os_b0], s[12:15], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[10:11], v[138:139], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[12:13], v[140:141], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[14:15], v[142:143], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[0:1], v[192:193], 0 \n"
" buffer_load_dwordx4 acc[136:139], %[v_os_b0], s[12:15], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[2:3], v[194:195], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[4:5], v[196:197], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[6:7], v[198:199], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[8:9], v[200:201], [%[c4], %[c5], %[c6], %[c7]] \n"
" buffer_load_dwordx4 acc[140:143], %[v_os_b0], s[12:15], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[10:11], v[202:203], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[12:13], v[204:205], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[14:15], v[206:207], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[16:17], v[128:129], 0 \n"
" buffer_load_dwordx4 acc[144:147], %[v_os_b1], s[12:15], 0 offen \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[18:19], v[130:131], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[20:21], v[132:133], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[22:23], v[134:135], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[24:25], v[136:137], [%[c8], %[c9], %[c10], %[c11]] \n"
" buffer_load_dwordx4 acc[148:151], %[v_os_b1], s[12:15], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[26:27], v[138:139], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[28:29], v[140:141], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[30:31], v[142:143], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[16:17], v[192:193], 0 \n"
" buffer_load_dwordx4 acc[152:155], %[v_os_b1], s[12:15], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[18:19], v[194:195], [%[c12], %[c13], %[c14], %[c15]] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[20:21], v[196:197], [%[c12], %[c13], %[c14], %[c15]] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[22:23], v[198:199], [%[c12], %[c13], %[c14], %[c15]] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[24:25], v[200:201], [%[c12], %[c13], %[c14], %[c15]] \n"
" buffer_load_dwordx4 acc[156:159], %[v_os_b1], s[12:15], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[26:27], v[202:203], [%[c12], %[c13], %[c14], %[c15]] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[28:29], v[204:205], [%[c12], %[c13], %[c14], %[c15]] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[30:31], v[206:207], [%[c12], %[c13], %[c14], %[c15]] \n"
" s_waitcnt vmcnt(32) \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[32:33], v[144:145], [%[c0], %[c1], %[c2], %[c3]] \n"
" buffer_load_dwordx4 acc[160:163], %[v_os_b2], s[12:15], 0 offen \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[34:35], v[146:147], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[36:37], v[148:149], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[38:39], v[150:151], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[40:41], v[152:153], [%[c0], %[c1], %[c2], %[c3]] \n"
" buffer_load_dwordx4 acc[164:167], %[v_os_b2], s[12:15], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[42:43], v[154:155], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[44:45], v[156:157], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[46:47], v[158:159], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[32:33], v[208:209], [%[c4], %[c5], %[c6], %[c7]] \n"
" buffer_load_dwordx4 acc[168:171], %[v_os_b2], s[12:15], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[34:35], v[210:211], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[36:37], v[212:213], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[38:39], v[214:215], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[40:41], v[216:217], [%[c4], %[c5], %[c6], %[c7]] \n"
" buffer_load_dwordx4 acc[172:175], %[v_os_b2], s[12:15], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[42:43], v[218:219], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[44:45], v[220:221], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[46:47], v[222:223], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[48:49], v[144:145], [%[c8], %[c9], %[c10], %[c11]] \n"
" buffer_load_dwordx4 acc[176:179], %[v_os_b3], s[12:15], 0 offen \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[50:51], v[146:147], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[52:53], v[148:149], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[54:55], v[150:151], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[56:57], v[152:153], [%[c8], %[c9], %[c10], %[c11]] \n"
" buffer_load_dwordx4 acc[180:183], %[v_os_b3], s[12:15], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[58:59], v[154:155], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[60:61], v[156:157], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[62:63], v[158:159], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[48:49], v[208:209], [%[c12], %[c13], %[c14], %[c15]] \n"
" buffer_load_dwordx4 acc[184:187], %[v_os_b3], s[12:15], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[50:51], v[210:211], [%[c12], %[c13], %[c14], %[c15]] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[52:53], v[212:213], [%[c12], %[c13], %[c14], %[c15]] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[54:55], v[214:215], [%[c12], %[c13], %[c14], %[c15]] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[56:57], v[216:217], [%[c12], %[c13], %[c14], %[c15]] \n"
" buffer_load_dwordx4 acc[188:191], %[v_os_b3], s[12:15], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[58:59], v[218:219], [%[c12], %[c13], %[c14], %[c15]] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[60:61], v[220:221], [%[c12], %[c13], %[c14], %[c15]] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[62:63], v[222:223], [%[c12], %[c13], %[c14], %[c15]] \n"
" s_waitcnt vmcnt(32) \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[64:65], v[160:161], [%[c0], %[c1], %[c2], %[c3]] \n"
" buffer_load_dwordx4 acc[192:195], %[v_os_b4], s[12:15], 0 offen \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[66:67], v[162:163], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[68:69], v[164:165], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[70:71], v[166:167], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[72:73], v[168:169], [%[c0], %[c1], %[c2], %[c3]] \n"
" buffer_load_dwordx4 acc[196:199], %[v_os_b4], s[12:15], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[74:75], v[170:171], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[76:77], v[172:173], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[78:79], v[174:175], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[64:65], v[224:225], [%[c4], %[c5], %[c6], %[c7]] \n"
" buffer_load_dwordx4 acc[200:203], %[v_os_b4], s[12:15], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[66:67], v[226:227], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[68:69], v[228:229], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[70:71], v[230:231], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[72:73], v[232:233], [%[c4], %[c5], %[c6], %[c7]] \n"
" buffer_load_dwordx4 acc[204:207], %[v_os_b4], s[12:15], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[74:75], v[234:235], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[76:77], v[236:237], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[78:79], v[238:239], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[80:81], v[160:161], [%[c8], %[c9], %[c10], %[c11]] \n"
" buffer_load_dwordx4 acc[208:211], %[v_os_b5], s[12:15], 0 offen \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[82:83], v[162:163], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[84:85], v[164:165], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[86:87], v[166:167], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[88:89], v[168:169], [%[c8], %[c9], %[c10], %[c11]] \n"
" buffer_load_dwordx4 acc[212:215], %[v_os_b5], s[12:15], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[90:91], v[170:171], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[92:93], v[172:173], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[94:95], v[174:175], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[80:81], v[224:225], [%[c12], %[c13], %[c14], %[c15]] \n"
" buffer_load_dwordx4 acc[216:219], %[v_os_b5], s[12:15], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[82:83], v[226:227], [%[c12], %[c13], %[c14], %[c15]] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[84:85], v[228:229], [%[c12], %[c13], %[c14], %[c15]] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[86:87], v[230:231], [%[c12], %[c13], %[c14], %[c15]] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[88:89], v[232:233], [%[c12], %[c13], %[c14], %[c15]] \n"
" buffer_load_dwordx4 acc[220:223], %[v_os_b5], s[12:15], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[90:91], v[234:235], [%[c12], %[c13], %[c14], %[c15]] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[92:93], v[236:237], [%[c12], %[c13], %[c14], %[c15]] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[94:95], v[238:239], [%[c12], %[c13], %[c14], %[c15]] \n"
" s_waitcnt vmcnt(32) \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[96:97], v[176:177], [%[c0], %[c1], %[c2], %[c3]] \n"
" buffer_load_dwordx4 acc[224:227], %[v_os_b6], s[12:15], 0 offen \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[98:99], v[178:179], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[100:101], v[180:181], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[102:103], v[182:183], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[104:105], v[184:185], [%[c0], %[c1], %[c2], %[c3]] \n"
" buffer_load_dwordx4 acc[228:231], %[v_os_b6], s[12:15], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[106:107], v[186:187], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[108:109], v[188:189], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[110:111], v[190:191], [%[c0], %[c1], %[c2], %[c3]] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[96:97], v[240:241], [%[c4], %[c5], %[c6], %[c7]] \n"
" buffer_load_dwordx4 acc[232:235], %[v_os_b6], s[12:15], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[98:99], v[242:243], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[100:101], v[244:245], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[102:103], v[246:247], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[104:105], v[248:249], [%[c4], %[c5], %[c6], %[c7]] \n"
" buffer_load_dwordx4 acc[236:239], %[v_os_b6], s[12:15], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[106:107], v[250:251], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[108:109], v[252:253], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[110:111], v[254:255], [%[c4], %[c5], %[c6], %[c7]] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[112:113], v[176:177], [%[c8], %[c9], %[c10], %[c11]] \n"
" buffer_load_dwordx4 acc[240:243], %[v_os_b7], s[12:15], 0 offen \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[114:115], v[178:179], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[116:117], v[180:181], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[118:119], v[182:183], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[120:121], v[184:185], [%[c8], %[c9], %[c10], %[c11]] \n"
" buffer_load_dwordx4 acc[244:247], %[v_os_b7], s[12:15], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[122:123], v[186:187], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[124:125], v[188:189], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[126:127], v[190:191], [%[c8], %[c9], %[c10], %[c11]] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[112:113], v[240:241], [%[c12], %[c13], %[c14], %[c15]] \n"
" buffer_load_dwordx4 acc[248:251], %[v_os_b7], s[12:15], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[114:115], v[242:243], [%[c12], %[c13], %[c14], %[c15]] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[116:117], v[244:245], [%[c12], %[c13], %[c14], %[c15]] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[118:119], v[246:247], [%[c12], %[c13], %[c14], %[c15]] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[120:121], v[248:249], [%[c12], %[c13], %[c14], %[c15]] \n"
" buffer_load_dwordx4 acc[252:255], %[v_os_b7], s[12:15], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[122:123], v[250:251], [%[c12], %[c13], %[c14], %[c15]] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[124:125], v[252:253], [%[c12], %[c13], %[c14], %[c15]] \n"
_UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[126:127], v[254:255], [%[c12], %[c13], %[c14], %[c15]]\n"
" v_mul_f32 %[c0], %[scale_0], %[c0] \n"
" v_mul_f32 %[c1], %[scale_0], %[c1] \n"
" v_mul_f32 %[c2], %[scale_0], %[c2] \n"
" v_mul_f32 %[c3], %[scale_0], %[c3] \n"
" v_mul_f32 %[c4], %[scale_1], %[c4] \n"
" v_mul_f32 %[c5], %[scale_1], %[c5] \n"
" v_mul_f32 %[c6], %[scale_1], %[c6] \n"
" v_mul_f32 %[c7], %[scale_1], %[c7] \n"
" v_mul_f32 %[c8], %[scale_0], %[c8] \n"
" v_mul_f32 %[c9], %[scale_0], %[c9] \n"
" v_mul_f32 %[c10], %[scale_0], %[c10] \n"
" v_mul_f32 %[c11], %[scale_0], %[c11] \n"
" v_mul_f32 %[c12], %[scale_1], %[c12] \n"
" v_mul_f32 %[c13], %[scale_1], %[c13] \n"
" v_mul_f32 %[c14], %[scale_1], %[c14] \n"
" v_mul_f32 %[c15], %[scale_1], %[c15] \n"
_UK_PK_CVT_("%[c0]", "%[c1]", "%[c0]")
_UK_PK_CVT_("%[c2]", "%[c3]", "%[c1]")
_UK_PK_CVT_("%[c4]", "%[c5]", "%[c2]")
_UK_PK_CVT_("%[c6]", "%[c7]", "%[c3]")
_UK_PK_CVT_("%[c8]", "%[c9]", "%[c4]")
_UK_PK_CVT_("%[c10]", "%[c11]", "%[c5]")
_UK_PK_CVT_("%[c12]", "%[c13]", "%[c6]")
_UK_PK_CVT_("%[c14]", "%[c15]", "%[c7]")
" ;------------------------------ \n"
" ds_write_b64 %[v_sfl_sst], [%[c0],%[c1]] offset:0 + %[shfl_base] \n"
" ds_write_b64 %[v_sfl_sst], [%[c2],%[c3]] offset:4352 + %[shfl_base] \n"
" ds_write_b64 %[v_sfl_sst], [%[c4],%[c5]] offset:2176 + %[shfl_base] \n"
" ds_write_b64 %[v_sfl_sst], [%[c6],%[c7]] offset:6528 + %[shfl_base] \n"
" s_waitcnt lgkmcnt(0) \n"
" s_barrier \n"
" ds_read_b32 %[c0], %[v_sfl_sld] offset:0 + %[shfl_base] \n"
" ds_read_b32 %[c1], %[v_sfl_sld] offset:32 + %[shfl_base] \n"
" ds_read_b32 %[c2], %[v_sfl_sld] offset:64 + %[shfl_base] \n"
" ds_read_b32 %[c3], %[v_sfl_sld] offset:96 + %[shfl_base] \n"
" ds_read_b32 %[c4], %[v_sfl_sld] offset:4352 + %[shfl_base] \n"
" ds_read_b32 %[c5], %[v_sfl_sld] offset:4384 + %[shfl_base] \n"
" ds_read_b32 %[c6], %[v_sfl_sld] offset:4416 + %[shfl_base] \n"
" ds_read_b32 %[c7], %[v_sfl_sld] offset:4448 + %[shfl_base] \n"
" s_waitcnt lgkmcnt(0) \n"
" s_mov_b64 exec, %[s_execflag_0] \n"
_UK_ATOMIC_ADD_ " %[v_os_o0], %[c0], s[8:9] \n"
" s_mov_b64 exec, %[s_execflag_1] \n"
_UK_ATOMIC_ADD_ " %[v_os_o1], %[c1], s[8:9] \n"
" s_mov_b64 exec, %[s_execflag_2] \n"
_UK_ATOMIC_ADD_ " %[v_os_o2], %[c2], s[8:9] \n"
" s_mov_b64 exec, %[s_execflag_3] \n"
_UK_ATOMIC_ADD_ " %[v_os_o3], %[c3], s[8:9] \n"
" s_mov_b64 exec, %[s_execflag_4] \n"
_UK_ATOMIC_ADD_ " %[v_os_o4], %[c4], s[8:9] \n"
" s_mov_b64 exec, %[s_execflag_5] \n"
_UK_ATOMIC_ADD_ " %[v_os_o5], %[c5], s[8:9] \n"
" s_mov_b64 exec, %[s_execflag_6] \n"
_UK_ATOMIC_ADD_ " %[v_os_o6], %[c6], s[8:9] \n"
" s_mov_b64 exec, %[s_execflag_7] \n"
_UK_ATOMIC_ADD_ " %[v_os_o7], %[c7], s[8:9] \n"
" s_mov_b64 exec, s[38:39] \n"
" s_sub_i32 %[s_loop_cnt], %[s_loop_cnt], 1 ; k-- \n"
" s_cmp_gt_i32 %[s_loop_cnt] 0 \n"
" s_cbranch_scc0 L_end%= \n"
" s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond \n"
" s_cselect_b32 s86, %[s_tile_os_b], 0 \n"
" s_add_u32 s12, s86, s12 \n"
" s_addc_u32 s13, 0, s13 \n"
" s_add_u32 s8, %[s_tile_os_o], s8 \n"
" s_addc_u32 s9, 0, s9 \n"
" s_waitcnt vmcnt(32) \n"
" s_barrier \n"
_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[128:129], v[128:129], 0 \n"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen \n"
_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[130:131], v[130:131], [%[c16],%[c17],%[c18],%[c19]] \n"
_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[132:133], v[132:133], [%[c16],%[c17],%[c18],%[c19]] \n"
_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[134:135], v[134:135], [%[c16],%[c17],%[c18],%[c19]] \n"
_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[136:137], v[136:137], [%[c16],%[c17],%[c18],%[c19]] \n"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[138:139], v[138:139], [%[c16],%[c17],%[c18],%[c19]] \n"
_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[140:141], v[140:141], [%[c16],%[c17],%[c18],%[c19]] \n"
_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[142:143], v[142:143], [%[c16],%[c17],%[c18],%[c19]] \n"
_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[128:129], v[192:193], 0 \n"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[130:131], v[194:195], [%[c20],%[c21],%[c22],%[c23]] \n"
_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[132:133], v[196:197], [%[c20],%[c21],%[c22],%[c23]] \n"
_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[134:135], v[198:199], [%[c20],%[c21],%[c22],%[c23]] \n"
_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[136:137], v[200:201], [%[c20],%[c21],%[c22],%[c23]] \n"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[138:139], v[202:203], [%[c20],%[c21],%[c22],%[c23]] \n"
_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[140:141], v[204:205], [%[c20],%[c21],%[c22],%[c23]] \n"
_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[142:143], v[206:207], [%[c20],%[c21],%[c22],%[c23]] \n"
_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[144:145], v[128:129], 0 \n"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen \n"
_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[146:147], v[130:131], [%[c24],%[c25],%[c26],%[c27]] \n"
_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[148:149], v[132:133], [%[c24],%[c25],%[c26],%[c27]] \n"
_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[150:151], v[134:135], [%[c24],%[c25],%[c26],%[c27]] \n"
_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[152:153], v[136:137], [%[c24],%[c25],%[c26],%[c27]] \n"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[154:155], v[138:139], [%[c24],%[c25],%[c26],%[c27]] \n"
_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[156:157], v[140:141], [%[c24],%[c25],%[c26],%[c27]] \n"
_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[158:159], v[142:143], [%[c24],%[c25],%[c26],%[c27]] \n"
_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[144:145], v[192:193], 0 \n"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[146:147], v[194:195], [%[c28],%[c29],%[c30],%[c31]] \n"
_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[148:149], v[196:197], [%[c28],%[c29],%[c30],%[c31]] \n"
_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[150:151], v[198:199], [%[c28],%[c29],%[c30],%[c31]] \n"
_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[152:153], v[200:201], [%[c28],%[c29],%[c30],%[c31]] \n"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[154:155], v[202:203], [%[c28],%[c29],%[c30],%[c31]] \n"
_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[156:157], v[204:205], [%[c28],%[c29],%[c30],%[c31]] \n"
_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[158:159], v[206:207], [%[c28],%[c29],%[c30],%[c31]] \n"
" s_waitcnt vmcnt(32) \n"
_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[160:161], v[144:145], [%[c16],%[c17],%[c18],%[c19]] \n"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen \n"
_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[162:163], v[146:147], [%[c16],%[c17],%[c18],%[c19]] \n"
_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[164:165], v[148:149], [%[c16],%[c17],%[c18],%[c19]] \n"
_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[166:167], v[150:151], [%[c16],%[c17],%[c18],%[c19]] \n"
_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[168:169], v[152:153], [%[c16],%[c17],%[c18],%[c19]] \n"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[170:171], v[154:155], [%[c16],%[c17],%[c18],%[c19]] \n"
_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[172:173], v[156:157], [%[c16],%[c17],%[c18],%[c19]] \n"
_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[174:175], v[158:159], [%[c16],%[c17],%[c18],%[c19]] \n"
_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[160:161], v[208:209], [%[c20],%[c21],%[c22],%[c23]] \n"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[162:163], v[210:211], [%[c20],%[c21],%[c22],%[c23]] \n"
_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[164:165], v[212:213], [%[c20],%[c21],%[c22],%[c23]] \n"
_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[166:167], v[214:215], [%[c20],%[c21],%[c22],%[c23]] \n"
_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[168:169], v[216:217], [%[c20],%[c21],%[c22],%[c23]] \n"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[170:171], v[218:219], [%[c20],%[c21],%[c22],%[c23]] \n"
_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[172:173], v[220:221], [%[c20],%[c21],%[c22],%[c23]] \n"
_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[174:175], v[222:223], [%[c20],%[c21],%[c22],%[c23]] \n"
_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[176:177], v[144:145], [%[c24],%[c25],%[c26],%[c27]] \n"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen \n"
_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[178:179], v[146:147], [%[c24],%[c25],%[c26],%[c27]] \n"
_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[180:181], v[148:149], [%[c24],%[c25],%[c26],%[c27]] \n"
_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[182:183], v[150:151], [%[c24],%[c25],%[c26],%[c27]] \n"
_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[184:185], v[152:153], [%[c24],%[c25],%[c26],%[c27]] \n"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[186:187], v[154:155], [%[c24],%[c25],%[c26],%[c27]] \n"
_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[188:189], v[156:157], [%[c24],%[c25],%[c26],%[c27]] \n"
_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[190:191], v[158:159], [%[c24],%[c25],%[c26],%[c27]] \n"
_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[176:177], v[208:209], [%[c28],%[c29],%[c30],%[c31]] \n"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[178:179], v[210:211], [%[c28],%[c29],%[c30],%[c31]] \n"
_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[180:181], v[212:213], [%[c28],%[c29],%[c30],%[c31]] \n"
_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[182:183], v[214:215], [%[c28],%[c29],%[c30],%[c31]] \n"
_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[184:185], v[216:217], [%[c28],%[c29],%[c30],%[c31]] \n"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[186:187], v[218:219], [%[c28],%[c29],%[c30],%[c31]] \n"
_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[188:189], v[220:221], [%[c28],%[c29],%[c30],%[c31]] \n"
_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[190:191], v[222:223], [%[c28],%[c29],%[c30],%[c31]] \n"
" s_waitcnt vmcnt(32) \n"
_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[192:193], v[160:161], [%[c16],%[c17],%[c18],%[c19]] \n"
" buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[12:15], 0 offen \n"
_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[194:195], v[162:163], [%[c16],%[c17],%[c18],%[c19]] \n"
_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[196:197], v[164:165], [%[c16],%[c17],%[c18],%[c19]] \n"
_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[198:199], v[166:167], [%[c16],%[c17],%[c18],%[c19]] \n"
_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[200:201], v[168:169], [%[c16],%[c17],%[c18],%[c19]] \n"
" buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[12:15], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[202:203], v[170:171], [%[c16],%[c17],%[c18],%[c19]] \n"
_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[204:205], v[172:173], [%[c16],%[c17],%[c18],%[c19]] \n"
_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[206:207], v[174:175], [%[c16],%[c17],%[c18],%[c19]] \n"
_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[192:193], v[224:225], [%[c20],%[c21],%[c22],%[c23]] \n"
" buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[12:15], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[194:195], v[226:227], [%[c20],%[c21],%[c22],%[c23]] \n"
_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[196:197], v[228:229], [%[c20],%[c21],%[c22],%[c23]] \n"
_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[198:199], v[230:231], [%[c20],%[c21],%[c22],%[c23]] \n"
_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[200:201], v[232:233], [%[c20],%[c21],%[c22],%[c23]] \n"
" buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[12:15], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[202:203], v[234:235], [%[c20],%[c21],%[c22],%[c23]] \n"
_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[204:205], v[236:237], [%[c20],%[c21],%[c22],%[c23]] \n"
_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[206:207], v[238:239], [%[c20],%[c21],%[c22],%[c23]] \n"
_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[208:209], v[160:161], [%[c24],%[c25],%[c26],%[c27]] \n"
" buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[12:15], 0 offen \n"
_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[210:211], v[162:163], [%[c24],%[c25],%[c26],%[c27]] \n"
_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[212:213], v[164:165], [%[c24],%[c25],%[c26],%[c27]] \n"
_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[214:215], v[166:167], [%[c24],%[c25],%[c26],%[c27]] \n"
_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[216:217], v[168:169], [%[c24],%[c25],%[c26],%[c27]] \n"
" buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[12:15], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[218:219], v[170:171], [%[c24],%[c25],%[c26],%[c27]] \n"
_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[220:221], v[172:173], [%[c24],%[c25],%[c26],%[c27]] \n"
_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[222:223], v[174:175], [%[c24],%[c25],%[c26],%[c27]] \n"
_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[208:209], v[224:225], [%[c28],%[c29],%[c30],%[c31]] \n"
" buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[12:15], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[210:211], v[226:227], [%[c28],%[c29],%[c30],%[c31]] \n"
_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[212:213], v[228:229], [%[c28],%[c29],%[c30],%[c31]] \n"
_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[214:215], v[230:231], [%[c28],%[c29],%[c30],%[c31]] \n"
_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[216:217], v[232:233], [%[c28],%[c29],%[c30],%[c31]] \n"
" buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[12:15], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[218:219], v[234:235], [%[c28],%[c29],%[c30],%[c31]] \n"
_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[220:221], v[236:237], [%[c28],%[c29],%[c30],%[c31]] \n"
_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[222:223], v[238:239], [%[c28],%[c29],%[c30],%[c31]] \n"
" s_waitcnt vmcnt(32) \n"
_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[224:225], v[176:177], [%[c16],%[c17],%[c18],%[c19]] \n"
" buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[12:15], 0 offen \n"
_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[226:227], v[178:179], [%[c16],%[c17],%[c18],%[c19]] \n"
_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[228:229], v[180:181], [%[c16],%[c17],%[c18],%[c19]] \n"
_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[230:231], v[182:183], [%[c16],%[c17],%[c18],%[c19]] \n"
_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[232:233], v[184:185], [%[c16],%[c17],%[c18],%[c19]] \n"
" buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[12:15], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[234:235], v[186:187], [%[c16],%[c17],%[c18],%[c19]] \n"
_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[236:237], v[188:189], [%[c16],%[c17],%[c18],%[c19]] \n"
_UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[238:239], v[190:191], [%[c16],%[c17],%[c18],%[c19]] \n"
_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[224:225], v[240:241], [%[c20],%[c21],%[c22],%[c23]] \n"
" buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[12:15], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[226:227], v[242:243], [%[c20],%[c21],%[c22],%[c23]] \n"
_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[228:229], v[244:245], [%[c20],%[c21],%[c22],%[c23]] \n"
_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[230:231], v[246:247], [%[c20],%[c21],%[c22],%[c23]] \n"
_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[232:233], v[248:249], [%[c20],%[c21],%[c22],%[c23]] \n"
" buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[12:15], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[234:235], v[250:251], [%[c20],%[c21],%[c22],%[c23]] \n"
_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[236:237], v[252:253], [%[c20],%[c21],%[c22],%[c23]] \n"
_UK_MFMA_ " [%[c20],%[c21],%[c22],%[c23]], acc[238:239], v[254:255], [%[c20],%[c21],%[c22],%[c23]] \n"
_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[240:241], v[176:177], [%[c24],%[c25],%[c26],%[c27]] \n"
" buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[12:15], 0 offen \n"
_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[242:243], v[178:179], [%[c24],%[c25],%[c26],%[c27]] \n"
_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[244:245], v[180:181], [%[c24],%[c25],%[c26],%[c27]] \n"
_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[246:247], v[182:183], [%[c24],%[c25],%[c26],%[c27]] \n"
_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[248:249], v[184:185], [%[c24],%[c25],%[c26],%[c27]] \n"
" buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[12:15], 0 offen offset:1024 \n"
_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[250:251], v[186:187], [%[c24],%[c25],%[c26],%[c27]] \n"
_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[252:253], v[188:189], [%[c24],%[c25],%[c26],%[c27]] \n"
_UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[254:255], v[190:191], [%[c24],%[c25],%[c26],%[c27]] \n"
_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[240:241], v[240:241], [%[c28],%[c29],%[c30],%[c31]] \n"
" buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[12:15], 0 offen offset:2048 \n"
_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[242:243], v[242:243], [%[c28],%[c29],%[c30],%[c31]] \n"
_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[244:245], v[244:245], [%[c28],%[c29],%[c30],%[c31]] \n"
_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[246:247], v[246:247], [%[c28],%[c29],%[c30],%[c31]] \n"
_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[248:249], v[248:249], [%[c28],%[c29],%[c30],%[c31]] \n"
" buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[12:15], 0 offen offset:3072 \n"
_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[250:251], v[250:251], [%[c28],%[c29],%[c30],%[c31]] \n"
_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[252:253], v[252:253], [%[c28],%[c29],%[c30],%[c31]] \n"
_UK_MFMA_ " [%[c28],%[c29],%[c30],%[c31]], acc[254:255], v[254:255], [%[c28],%[c29],%[c30],%[c31]]\n"
" v_mul_f32 %[c16], %[scale_0], %[c16] \n"
" v_mul_f32 %[c17], %[scale_0], %[c17] \n"
" v_mul_f32 %[c18], %[scale_0], %[c18] \n"
" v_mul_f32 %[c19], %[scale_0], %[c19] \n"
" v_mul_f32 %[c20], %[scale_1], %[c20] \n"
" v_mul_f32 %[c21], %[scale_1], %[c21] \n"
" v_mul_f32 %[c22], %[scale_1], %[c22] \n"
" v_mul_f32 %[c23], %[scale_1], %[c23] \n"
" v_mul_f32 %[c24], %[scale_0], %[c24] \n"
" v_mul_f32 %[c25], %[scale_0], %[c25] \n"
" v_mul_f32 %[c26], %[scale_0], %[c26] \n"
" v_mul_f32 %[c27], %[scale_0], %[c27] \n"
" v_mul_f32 %[c28], %[scale_1], %[c28] \n"
" v_mul_f32 %[c29], %[scale_1], %[c29] \n"
" v_mul_f32 %[c30], %[scale_1], %[c30] \n"
" v_mul_f32 %[c31], %[scale_1], %[c31] \n"
_UK_PK_CVT_("%[c16]", "%[c17]", "%[c16]")
_UK_PK_CVT_("%[c18]", "%[c19]", "%[c17]")
_UK_PK_CVT_("%[c20]", "%[c21]", "%[c18]")
_UK_PK_CVT_("%[c22]", "%[c23]", "%[c19]")
_UK_PK_CVT_("%[c24]", "%[c25]", "%[c20]")
_UK_PK_CVT_("%[c26]", "%[c27]", "%[c21]")
_UK_PK_CVT_("%[c28]", "%[c29]", "%[c22]")
_UK_PK_CVT_("%[c30]", "%[c31]", "%[c23]")
" ;------------------------------ \n"
" ds_write_b64 %[v_sfl_sst], [%[c16],%[c17]] offset:0 + %[shfl_base] \n"
" ds_write_b64 %[v_sfl_sst], [%[c18],%[c19]] offset:4352 + %[shfl_base] \n"
" ds_write_b64 %[v_sfl_sst], [%[c20],%[c21]] offset:2176 + %[shfl_base] \n"
" ds_write_b64 %[v_sfl_sst], [%[c22],%[c23]] offset:6528 + %[shfl_base] \n"
" s_waitcnt lgkmcnt(0) \n"
" s_barrier \n"
" ds_read_b32 %[c16], %[v_sfl_sld] offset:0 + %[shfl_base] \n"
" ds_read_b32 %[c17], %[v_sfl_sld] offset:32 + %[shfl_base] \n"
" ds_read_b32 %[c18], %[v_sfl_sld] offset:64 + %[shfl_base] \n"
" ds_read_b32 %[c19], %[v_sfl_sld] offset:96 + %[shfl_base] \n"
" ds_read_b32 %[c20], %[v_sfl_sld] offset:4352 + %[shfl_base] \n"
" ds_read_b32 %[c21], %[v_sfl_sld] offset:4384 + %[shfl_base] \n"
" ds_read_b32 %[c22], %[v_sfl_sld] offset:4416 + %[shfl_base] \n"
" ds_read_b32 %[c23], %[v_sfl_sld] offset:4448 + %[shfl_base] \n"
" s_waitcnt lgkmcnt(0) \n"
" s_mov_b64 exec, %[s_execflag_0] \n"
_UK_ATOMIC_ADD_ " %[v_os_o0], %[c16], s[8:9] \n"
" s_mov_b64 exec, %[s_execflag_1] \n"
_UK_ATOMIC_ADD_ " %[v_os_o1], %[c17], s[8:9] \n"
" s_mov_b64 exec, %[s_execflag_2] \n"
_UK_ATOMIC_ADD_ " %[v_os_o2], %[c18], s[8:9] \n"
" s_mov_b64 exec, %[s_execflag_3] \n"
_UK_ATOMIC_ADD_ " %[v_os_o3], %[c19], s[8:9] \n"
" s_mov_b64 exec, %[s_execflag_4] \n"
_UK_ATOMIC_ADD_ " %[v_os_o4], %[c20], s[8:9] \n"
" s_mov_b64 exec, %[s_execflag_5] \n"
_UK_ATOMIC_ADD_ " %[v_os_o5], %[c21], s[8:9] \n"
" s_mov_b64 exec, %[s_execflag_6] \n"
_UK_ATOMIC_ADD_ " %[v_os_o6], %[c22], s[8:9] \n"
" s_mov_b64 exec, %[s_execflag_7] \n"
_UK_ATOMIC_ADD_ " %[v_os_o7], %[c23], s[8:9] \n"
" s_mov_b64 exec, s[38:39] \n"
" s_sub_i32 %[s_loop_cnt], %[s_loop_cnt], 1 ; k-- \n"
" s_cmp_gt_i32 %[s_loop_cnt] 0 \n"
" s_cbranch_scc0 L_end%= \n"
" s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond \n"
" s_cselect_b32 s86, %[s_tile_os_b], 0 \n"
" s_add_u32 s12, s86, s12 \n"
" s_addc_u32 s13, 0, s13 \n"
" s_add_u32 s8, %[s_tile_os_o], s8 \n"
" s_addc_u32 s9, 0, s9 \n"
" s_branch L_start%= \n"
"L_end%=: \n"
#undef _UK_MFMA_
#undef _UK_PK_CVT_
#undef _UK_ATOMIC_ADD_
# define _DEQUAN_CVT_(a, b, c) \
" v_cvt_f32_i32 a[0], a[0] \n" \
" v_cvt_f32_i32 a[1], a[1] \n" \
" v_cvt_f32_i32 a[2], a[2] \n" \
" v_cvt_f32_i32 a[3], a[3] \n" \
" v_mul_f32 a[0], v15, a[0] \n" \
" v_mul_f32 a[1], v15, a[1] \n" \
" v_mul_f32 a[2], v15, a[2] \n" \
" v_mul_f32 a[3], v15, a[3] \n" \
" v_mul_f32 a[0], v17, a[0] row_newbcast:12 \n" \
" v_mul_f32 a[1], v17, a[1] row_newbcast:13 \n" \
" v_mul_f32 a[2], v17, a[2] row_newbcast:14 \n" \
" v_mul_f32 a[3], v17, a[3] row_newbcast:15 \n" \
//////////GQ/DQ/GsmQ_addr///////////////
//expert weight addr no need
// s_mul_i32 s60, s3, 32 // 00000000056C: 923CA003 s3 s_tg_idy
// s_mul_i32 s60, 4, s60 // 000000000570: 923C3C84
// s_add_u32 s40, s60, s40 // 000000000574: 8028283C s40 sw_ptr
// s_addc_u32 s41, 0, s41 // 000000000578: 82292980 s41 sw_ptr
// v_and_b32 v54, 15, v0 // 00000000057C: 266C008F
// v_lshlrev_b32 v8, 2, v54 // 000000000580: 24106C82 v8/9 w addr
// v_add_u32 v9, 64, v8 // 000000000584: 681210C0
//GQDQ addr function kkkkkkkkkkkkkk
" v_lshrrev_b32 v54, 4, v0 \n"
" v_lshlrev_b32 v55, 2, v54 \n"
" v_and_b32 v54, 15, v0 \n"
" v_lshrrev_b32 v56, 2, v54 \n"
" v_lshlrev_b32 v56, 6, v56 \n"
" v_add_u32 v55, v56, v55 \n"
" v_and_b32 v54, 3, v0 \n"
" v_add_u32 v55, v54, v55 \n"
" v_lshlrev_b32 v10, 2, v55 \n"
" v_add_u32 v11, 0x00000400, v10 \n"
" s_mul_i32 s60, %[s_wave_id], 16 \n"
" s_mul_i32 s60, s60, 4 \n"
" v_add_u32 v10, s60, v10 \n"
" v_add_u32 v11, s60, v11 \n"
" v_mov_b32 v5, v10 \n"
//////////////////////////////
" s_mov_b32 s57, 0x00000100 \n"
" s_mov_b32 s58, 0x00001000 \n"
" s_mov_b32 s79, 0x00000400 \n"
" s_mov_b32 s59, 0x00000200 \n"
////////
//" s_mul_i32 s60, s70, 0x00000100 \n"
//" s_sub_u32 s56, s60, 0x00001000 \n"
///////////////
" s_mov_b32 s78, 0x00001000 \n"
" s_mov_b32 s52, 0x07060302 \n"
" s_mov_b32 s53, 0x00000400 \n"
" s_mov_b32 s54, 0x00040100 \n"
" s_mov_b32 s55, 0x04020100 \n"
" s_mov_b32 s6, 0x3fb8aa3b \n"
" s_mov_b32 s77, 0xbd92220c \n"
" v_mov_b32 v1, 0xbfcc4231 \n"
" v_mov_b32 v51, 0xffff0000 \n"
" v_mov_b32 v52, 0x7fff0000 \n"
" v_mov_b32 v53, 0x00007fff \n"
" s_waitcnt 0x0000 \n"
///XQ ADDR, fake token id
" v_mov_b32 %[v_token_id], %[v_token_id] \n"
" v_lshrrev_b32 v54, 24, %[v_token_id] \n"
" v_mul_i32_i24 v54, s66, v54 \n"
" v_and_b32 v55, 0x00ffffff, %[v_token_id] \n"
" v_add_u32 %[v_token_id], v54, v55 \n"
" v_lshrrev_b32 v54, 24, v7 \n"
" v_mul_i32_i24 v54, s66, v54 \n"
" v_and_b32 v55, 0x00ffffff, v7 \n"
" v_add_u32 v7, v54, v55 \n"
" v_lshlrev_b32 %[v_token_id], 2, %[v_token_id] \n"
" v_lshlrev_b32 v7, 2, v7 \n"
" buffer_load_dword v14, %[v_token_id], s[28:31], 0 offen \n"
" buffer_load_dword v15, v7, s[28:31], 0 offen \n"
" buffer_load_dword v16, v10, s[32:35], 0 offen \n"
" buffer_load_dword v17, v11, s[32:35], 0 offen \n"
" buffer_load_dword v18, v10, s[36:39], 0 offen \n"
" buffer_load_dword v19, v11, s[36:39], 0 offen \n"
" buffer_load_dword v20, v8, s[40:43], 0 offen \n"
" buffer_load_dword v21, v9, s[40:43], 0 offen \n"
"s_mov_b32 s16, %[s_res_dq0] \n"
"s_mov_b32 s17, %[s_res_dq1] \n"
"s_mov_b32 s18, %[s_res_dq2] \n"
"s_mov_b32 s19, %[s_res_dq3] \n"
"s_mov_b32 s32, %[s_res_gq0] \n"
"s_mov_b32 s33, %[s_res_gq1] \n"
"s_mov_b32 s34, %[s_res_gq2] \n"
"s_mov_b32 s35, %[s_res_gq3] \n"
"s_mov_b32 s20, %[s_res_a0] \n"
"s_mov_b32 s21, %[s_res_a1] \n"
"s_mov_b32 s22, %[s_res_a2] \n"
"s_mov_b32 s23, %[s_res_a3] \n"
"s_mov_b32 s24, %[s_res_b0] \n"
"s_mov_b32 s25, %[s_res_b1] \n"
"s_mov_b32 s26, %[s_res_b2] \n"
"s_mov_b32 s27, %[s_res_b3] \n"
" s_mov_b32 s80, 0 \n"
//---------------------v26-33 no need
// "s_nop 4\n"
"; -- prefetch A0\n"
"s_add_u32 m0, 0, %[s_m0_init] \n"
"buffer_load_dword %[v_os_a0], s[20:23], 0 offen lds \n"
"s_add_u32 m0, %[s_size_per_issue], m0 \n"
"buffer_load_dword %[v_os_a1], s[20:23], 0 offen lds \n"
"s_add_u32 m0, %[s_size_per_issue], m0 \n"
"buffer_load_dword %[v_os_a2], s[20:23], 0 offen lds \n"
"s_add_u32 m0, %[s_size_per_issue], m0 \n"
"buffer_load_dword %[v_os_a3], s[20:23], 0 offen lds \n"
"s_add_u32 m0, %[s_size_per_issue], m0 \n"
"buffer_load_dword %[v_os_a4], s[20:23], 0 offen lds \n"
"s_add_u32 m0, %[s_size_per_issue], m0 \n"
"buffer_load_dword %[v_os_a5], s[20:23], 0 offen lds \n"
"s_add_u32 m0, %[s_size_per_issue], m0 \n"
"buffer_load_dword %[v_os_a6], s[20:23], 0 offen lds \n"
"s_add_u32 m0, %[s_size_per_issue], m0 \n"
"buffer_load_dword %[v_os_a7], s[20:23], 0 offen lds \n"
"s_add_u32 m0, %[smem_sz], %[s_m0_init] \n"
" s_add_u32 s20, s57, s20 \n"
" s_addc_u32 s21, 0, s21 \n"
"; -- prefetch A1\n"
"buffer_load_dword %[v_os_a0], s[20:23], 0 offen lds \n"
"s_add_u32 m0, %[s_size_per_issue], m0 \n"
"buffer_load_dword %[v_os_a1], s[20:23], 0 offen lds \n"
"s_add_u32 m0, %[s_size_per_issue], m0 \n"
"buffer_load_dword %[v_os_a2], s[20:23], 0 offen lds \n"
"s_add_u32 m0, %[s_size_per_issue], m0 \n"
"buffer_load_dword %[v_os_a3], s[20:23], 0 offen lds \n"
"s_add_u32 m0, %[s_size_per_issue], m0 \n"
"buffer_load_dword %[v_os_a4], s[20:23], 0 offen lds \n"
"s_add_u32 m0, %[s_size_per_issue], m0 \n"
"buffer_load_dword %[v_os_a5], s[20:23], 0 offen lds \n"
"s_add_u32 m0, %[s_size_per_issue], m0 \n"
"buffer_load_dword %[v_os_a6], s[20:23], 0 offen lds \n"
"s_add_u32 m0, %[s_size_per_issue], m0 \n"
"buffer_load_dword %[v_os_a7], s[20:23], 0 offen lds \n"
"s_add_u32 m0, 0, %[s_m0_init] \n"
" s_add_u32 s20, s57, s20 \n"
" s_addc_u32 s21, 0, s21 \n"
"; -- prefetch B0\n"
"buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[24:27], 0 offen \n"
"buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[24:27], 0 offen offset:1024 \n"
"buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[24:27], 0 offen offset:2048 \n"
"buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[24:27], 0 offen offset:3072 \n"
"buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[24:27], 0 offen \n"
"buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[24:27], 0 offen offset:1024 \n"
"buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[24:27], 0 offen offset:2048 \n"
"buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[24:27], 0 offen offset:3072 \n"
"buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[24:27], 0 offen \n"
"buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[24:27], 0 offen offset:1024 \n"
"buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[24:27], 0 offen offset:2048 \n"
"buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[24:27], 0 offen offset:3072 \n"
"buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[24:27], 0 offen \n"
"buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[24:27], 0 offen offset:1024 \n"
"buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[24:27], 0 offen offset:2048 \n"
"buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[24:27], 0 offen offset:3072 \n"
"buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[24:27], 0 offen \n"
"buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[24:27], 0 offen offset:1024 \n"
"buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[24:27], 0 offen offset:2048 \n"
"buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[24:27], 0 offen offset:3072 \n"
"buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[24:27], 0 offen \n"
"buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[24:27], 0 offen offset:1024 \n"
"buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[24:27], 0 offen offset:2048 \n"
"buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[24:27], 0 offen offset:3072 \n"
"buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[24:27], 0 offen \n"
"buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[24:27], 0 offen offset:1024 \n"
"buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[24:27], 0 offen offset:2048 \n"
"buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[24:27], 0 offen offset:3072 \n"
"buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[24:27], 0 offen \n"
"buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[24:27], 0 offen offset:1024 \n"
"buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[24:27], 0 offen offset:2048 \n"
"buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[24:27], 0 offen offset:3072 \n"
"s_add_u32 s24, s58, s24 \n"
"s_addc_u32 s25, 0, s25 \n"
" s_waitcnt vmcnt(40) \n"
" s_barrier \n"
///////////////////////////////
"ds_read_b128 v[192:195], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_0]\n" // 1024: N stride, 64 K stride
"ds_read_b128 v[196:199], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_1]\n"
"ds_read_b128 v[200:203], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_2]\n"
"ds_read_b128 v[204:207], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_3]\n"
"ds_read_b128 v[208:211], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_4]\n"
"ds_read_b128 v[212:215], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_5]\n"
"ds_read_b128 v[216:219], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_6]\n"
"ds_read_b128 v[220:223], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_7]\n"
////////////////////////////
"label_start:
" s_waitcnt vmcnt(24) & lgkmcnt(0) \n"
" s_barrier \n"
_UK_MFMA_ " v[128:131], acc[0:1], v[192:193], v[128:131] \n"
_UK_MFMA_ " v[128:131], acc[2:3], v[194:195], v[128:131] \n"
" buffer_load_dwordx4 acc[128:131], %[v_os_b0], s[24:27], 0 offen \n"
_UK_MFMA_ " v[128:131], acc[4:5], v[196:197], v[128:131] \n"
_UK_MFMA_ " v[128:131], acc[6:7], v[198:199], v[128:131] \n"
" buffer_load_dword s[20:23], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " v[128:131], acc[8:9], v[200:201], v[128:131] \n"
_UK_MFMA_ " v[128:131], acc[10:11], v[202:203], v[128:131] \n"
" buffer_load_dwordx4 acc[132:135], %[v_os_b0], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " v[128:131], acc[12:13], v[204:205], v[128:131] \n"
_UK_MFMA_ " v[128:131], acc[14:15], v[206:207], v[128:131] \n"
" buffer_load_dword s[20:23], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " v[132:135], acc[0:1], v[208:209], v[132:135] \n"
_UK_MFMA_ " v[132:135], acc[2:3], v[210:211], v[132:135] \n"
" buffer_load_dwordx4 acc[136:139], %[v_os_b0], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " v[132:135], acc[4:5], v[212:213], v[132:135] \n"
_UK_MFMA_ " v[132:135], acc[6:7], v[214:215], v[132:135] \n"
" buffer_load_dword s[20:23], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " v[132:135], acc[8:9], v[216:217], v[132:135] \n"
_UK_MFMA_ " v[132:135], acc[10:11], v[218:219], v[132:135] \n"
" buffer_load_dwordx4 acc[140:143], %[v_os_b0], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " v[132:135], acc[12:13], v[220:221], v[132:135] \n"
_UK_MFMA_ " v[132:135], acc[14:15], v[222:223], v[132:135] \n"
" buffer_load_dword s[20:23], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " v[136:139], acc[16:17], v[192:193], v[136:139] \n"
_UK_MFMA_ " v[136:139], acc[18:19], v[194:195], v[136:139] \n"
" buffer_load_dwordx4 acc[144:147], %[v_os_b1], s[24:27], 0 offen \n"
_UK_MFMA_ " v[136:139], acc[20:21], v[196:197], v[136:139] \n"
_UK_MFMA_ " v[136:139], acc[22:23], v[198:199], v[136:139] \n"
" buffer_load_dword s[20:23], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " v[136:139], acc[24:25], v[200:201], v[136:139] \n"
_UK_MFMA_ " v[136:139], acc[26:27], v[202:203], v[136:139] \n"
" buffer_load_dwordx4 acc[148:151], %[v_os_b1], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " v[136:139], acc[28:29], v[204:205], v[136:139] \n"
_UK_MFMA_ " v[136:139], acc[30:31], v[206:207], v[136:139] \n"
" buffer_load_dword s[20:23], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " v[140:143], acc[16:17], v[208:209], v[140:143] \n"
_UK_MFMA_ " v[140:143], acc[18:19], v[210:211], v[140:143] \n"
" buffer_load_dwordx4 acc[152:155], %[v_os_b1], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " v[140:143], acc[20:21], v[212:213], v[140:143] \n"
_UK_MFMA_ " v[140:143], acc[22:23], v[214:215], v[140:143] \n"
" buffer_load_dword s[20:23], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " v[140:143], acc[24:25], v[216:217], v[140:143] \n"
_UK_MFMA_ " v[140:143], acc[26:27], v[218:219], v[140:143] \n"
" buffer_load_dwordx4 acc[156:159], %[v_os_b1], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " v[140:143], acc[28:29], v[220:221], v[140:143] \n"
_UK_MFMA_ " v[140:143], acc[30:31], v[222:223], v[140:143] \n"
" buffer_load_dword s[20:23], 0 offen lds \n"
" s_add_u32 m0, %[smem_sz], %[s_m0_init] \n"
" s_waitcnt vmcnt(32) \n"
_UK_MFMA_ " v[144:147], acc[32:33], v[192:193], v[144:147] \n"
_UK_MFMA_ " v[144:147], acc[34:35], v[194:195], v[144:147] \n"
" buffer_load_dwordx4 acc[160:163], %[v_os_b2], s[24:27], 0 offen \n"
_UK_MFMA_ " v[144:147], acc[36:37], v[196:197], v[144:147] \n"
_UK_MFMA_ " v[144:147], acc[38:39], v[198:199], v[144:147] \n"
" ds_read_b128 v[224:227], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_0] \n"
_UK_MFMA_ " v[144:147], acc[40:41], v[200:201], v[144:147] \n"
_UK_MFMA_ " v[144:147], acc[42:43], v[202:203], v[144:147] \n"
" buffer_load_dwordx4 acc[164:167], %[v_os_b2], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " v[144:147], acc[44:45], v[204:205], v[144:147] \n"
_UK_MFMA_ " v[144:147], acc[46:47], v[206:207], v[144:147] \n"
" ds_read_b128 v[228:231], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_1] \n"
_UK_MFMA_ " v[148:151], acc[32:33], v[208:209], v[148:151] \n"
_UK_MFMA_ " v[148:151], acc[34:35], v[210:211], v[148:151] \n"
" buffer_load_dwordx4 acc[168:171], %[v_os_b2], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " v[148:151], acc[36:37], v[212:213], v[148:151] \n"
_UK_MFMA_ " v[148:151], acc[38:39], v[214:215], v[148:151] \n"
" ds_read_b128 v[232:235], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_2] \n"
_UK_MFMA_ " v[148:151], acc[40:41], v[216:217], v[148:151] \n"
_UK_MFMA_ " v[148:151], acc[42:43], v[218:219], v[148:151] \n"
" buffer_load_dwordx4 acc[172:175], %[v_os_b2], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " v[148:151], acc[44:45], v[220:221], v[148:151] \n"
_UK_MFMA_ " v[148:151], acc[46:47], v[222:223], v[148:151] \n"
" ds_read_b128 v[236:239], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_3] \n"
_UK_MFMA_ " v[152:155], acc[48:49], v[192:193], v[152:155] \n"
_UK_MFMA_ " v[152:155], acc[50:51], v[194:195], v[152:155] \n"
" buffer_load_dwordx4 acc[176:179], %[v_os_b3], s[24:27], 0 offen \n"
_UK_MFMA_ " v[152:155], acc[52:53], v[196:197], v[152:155] \n"
_UK_MFMA_ " v[152:155], acc[54:55], v[198:199], v[152:155] \n"
" ds_read_b128 v[240:243], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_4] \n"
_UK_MFMA_ " v[152:155], acc[56:57], v[200:201], v[152:155] \n"
_UK_MFMA_ " v[152:155], acc[58:59], v[202:203], v[152:155] \n"
" buffer_load_dwordx4 acc[180:183], %[v_os_b3], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " v[152:155], acc[60:61], v[204:205], v[152:155] \n"
_UK_MFMA_ " v[152:155], acc[62:63], v[206:207], v[152:155] \n"
" ds_read_b128 v[244:247], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_5] \n"
_UK_MFMA_ " v[156:159], acc[48:49], v[208:209], v[156:159] \n"
_UK_MFMA_ " v[156:159], acc[50:51], v[210:211], v[156:159] \n"
" buffer_load_dwordx4 acc[184:187], %[v_os_b3], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " v[156:159], acc[52:53], v[212:213], v[156:159] \n"
_UK_MFMA_ " v[156:159], acc[54:55], v[214:215], v[156:159] \n"
" ds_read_b128 v[248:251], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_6] \n"
_UK_MFMA_ " v[156:159], acc[56:57], v[216:217], v[156:159] \n"
_UK_MFMA_ " v[156:159], acc[58:59], v[218:219], v[156:159] \n"
" buffer_load_dwordx4 acc[188:191], %[v_os_b3], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " v[156:159], acc[60:61], v[220:221], v[156:159] \n"
_UK_MFMA_ " v[156:159], acc[62:63], v[222:223], v[156:159] \n"
" ds_read_b128 v[252:255], %[v_os_sld] offset:1*%[smem_sz] + %[sld_os_7] \n"
" s_waitcnt vmcnt(32) \n"
_UK_MFMA_ " v[160:163], acc[64:65], v[192:193], v[160:163] \n"
_UK_MFMA_ " v[160:163], acc[66:67], v[194:195], v[160:163] \n"
" buffer_load_dwordx4 acc[192:195], %[v_os_b4], s[24:27], 0 offen \n"
_UK_MFMA_ " v[160:163], acc[68:69], v[196:197], v[160:163] \n"
_UK_MFMA_ " v[160:163], acc[70:71], v[198:199], v[160:163] \n"
_UK_MFMA_ " v[160:163], acc[72:73], v[200:201], v[160:163] \n"
_UK_MFMA_ " v[160:163], acc[74:75], v[202:203], v[160:163] \n"
" buffer_load_dwordx4 acc[196:199], %[v_os_b4], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " v[160:163], acc[76:77], v[204:205], v[160:163] \n"
_UK_MFMA_ " v[160:163], acc[78:79], v[206:207], v[160:163] \n"
_UK_MFMA_ " v[164:167], acc[64:65], v[208:209], v[164:167] \n"
_UK_MFMA_ " v[164:167], acc[66:67], v[210:211], v[164:167] \n"
" buffer_load_dwordx4 acc[200:203], %[v_os_b4], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " v[164:167], acc[68:69], v[212:213], v[164:167] \n"
_UK_MFMA_ " v[164:167], acc[70:71], v[214:215], v[164:167] \n"
_UK_MFMA_ " v[164:167], acc[72:73], v[216:217], v[164:167] \n"
_UK_MFMA_ " v[164:167], acc[74:75], v[218:219], v[164:167] \n"
" buffer_load_dwordx4 acc[204:207], %[v_os_b4], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " v[164:167], acc[76:77], v[220:221], v[164:167] \n"
_UK_MFMA_ " v[164:167], acc[78:79], v[222:223], v[164:167] \n"
_UK_MFMA_ " v[168:171], acc[80:81], v[192:193], v[168:171] \n"
_UK_MFMA_ " v[168:171], acc[82:83], v[194:195], v[168:171] \n"
" buffer_load_dwordx4 acc[208:211], %[v_os_b5], s[24:27], 0 offen \n"
_UK_MFMA_ " v[168:171], acc[84:85], v[196:197], v[168:171] \n"
_UK_MFMA_ " v[168:171], acc[86:87], v[198:199], v[168:171] \n"
_UK_MFMA_ " v[168:171], acc[88:89], v[200:201], v[168:171] \n"
_UK_MFMA_ " v[168:171], acc[90:91], v[202:203], v[168:171] \n"
" buffer_load_dwordx4 acc[212:215], %[v_os_b5], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " v[168:171], acc[92:93], v[204:205], v[168:171] \n"
_UK_MFMA_ " v[168:171], acc[94:95], v[206:207], v[168:171] \n"
_UK_MFMA_ " v[172:175], acc[80:81], v[208:209], v[172:175] \n"
_UK_MFMA_ " v[172:175], acc[82:83], v[210:211], v[172:175] \n"
" buffer_load_dwordx4 acc[216:219], %[v_os_b5], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " v[172:175], acc[84:85], v[212:213], v[172:175] \n"
_UK_MFMA_ " v[172:175], acc[86:87], v[214:215], v[172:175] \n"
_UK_MFMA_ " v[172:175], acc[88:89], v[216:217], v[172:175] \n"
_UK_MFMA_ " v[172:175], acc[90:91], v[218:219], v[172:175] \n"
" buffer_load_dwordx4 acc[220:223], %[v_os_b5], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " v[172:175], acc[92:93], v[220:221], v[172:175] \n"
_UK_MFMA_ " v[172:175], acc[94:95], v[222:223], v[172:175] \n"
" s_waitcnt vmcnt(32) \n"
_UK_MFMA_ " v[176:179], acc[96:97], v[192:193], v[176:179] \n"
_UK_MFMA_ " v[176:179], acc[98:99], v[194:195], v[176:179] \n"
" buffer_load_dwordx4 acc[224:227], %[v_os_b6], s[24:27], 0 offen \n"
_UK_MFMA_ " v[176:179], acc[100:101], v[196:197], v[176:179] \n"
_UK_MFMA_ " v[176:179], acc[102:103], v[198:199], v[176:179] \n"
_UK_MFMA_ " v[176:179], acc[104:105], v[200:201], v[176:179] \n"
_UK_MFMA_ " v[176:179], acc[106:107], v[202:203], v[176:179] \n"
" buffer_load_dwordx4 acc[228:231], %[v_os_b6], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " v[176:179], acc[108:109], v[204:205], v[176:179] \n"
_UK_MFMA_ " v[176:179], acc[110:111], v[206:207], v[176:179] \n"
_UK_MFMA_ " v[180:183], acc[96:97], v[208:209], v[180:183] \n"
_UK_MFMA_ " v[180:183], acc[98:99], v[210:211], v[180:183] \n"
" buffer_load_dwordx4 acc[232:235], %[v_os_b6], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " v[180:183], acc[100:101], v[212:213], v[180:183] \n"
_UK_MFMA_ " v[180:183], acc[102:103], v[214:215], v[180:183] \n"
_UK_MFMA_ " v[180:183], acc[104:105], v[216:217], v[180:183] \n"
_UK_MFMA_ " v[180:183], acc[106:107], v[218:219], v[180:183] \n"
" buffer_load_dwordx4 acc[236:239], %[v_os_b6], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " v[180:183], acc[108:109], v[220:221], v[180:183] \n"
_UK_MFMA_ " v[180:183], acc[110:111], v[222:223], v[180:183] \n"
_UK_MFMA_ " v[184:187], acc[112:113], v[192:193], v[184:187] \n"
_UK_MFMA_ " v[184:187], acc[114:115], v[194:195], v[184:187] \n"
" buffer_load_dwordx4 acc[240:243], %[v_os_b7], s[24:27], 0 offen \n"
_UK_MFMA_ " v[184:187], acc[116:117], v[196:197], v[184:187] \n"
_UK_MFMA_ " v[184:187], acc[118:119], v[198:199], v[184:187] \n"
_UK_MFMA_ " v[184:187], acc[120:121], v[200:201], v[184:187] \n"
_UK_MFMA_ " v[184:187], acc[122:123], v[202:203], v[184:187] \n"
" buffer_load_dwordx4 acc[244:247], %[v_os_b7], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " v[184:187], acc[124:125], v[204:205], v[184:187] \n"
_UK_MFMA_ " v[184:187], acc[126:127], v[206:207], v[184:187] \n"
_UK_MFMA_ " v[188:191], acc[112:113], v[208:209], v[188:191] \n"
_UK_MFMA_ " v[188:191], acc[114:115], v[210:211], v[188:191] \n"
" buffer_load_dwordx4 acc[248:251], %[v_os_b7], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " v[188:191], acc[116:117], v[212:213], v[188:191] \n"
_UK_MFMA_ " v[188:191], acc[118:119], v[214:215], v[188:191] \n"
_UK_MFMA_ " v[188:191], acc[120:121], v[216:217], v[188:191] \n"
_UK_MFMA_ " v[188:191], acc[122:123], v[218:219], v[188:191] \n"
" buffer_load_dwordx4 acc[252:255], %[v_os_b7], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " v[188:191], acc[124:125], v[220:221], v[188:191] \n"
_UK_MFMA_ " v[188:191], acc[126:127], v[222:223], v[188:191] \n"
" s_add_u32 s60, 0x00000300, s80 \n"
" s_cmp_lt_u32 s60, %[s_loop_cnt] \n"
" s_cselect_b32 s57, s57, 0 \n"
" s_add_u32 s60, 0x00000200, s80 \n"
" s_cmp_lt_u32 s60, %[s_loop_cnt] \n"
" s_cselect_b32 s58, s58, 0 \n"
" s_add_u32 s20, s57, s20 \n"
" s_addc_u32 s21, 0, s21 \n"
" s_add_u32 s24, s58, s24 \n"
" s_addc_u32 s25, 0, s25 \n"
" s_addk_i32 s80, 0x0100 \n"
" s_cmp_lt_i32 s80, %[s_loop_cnt] \n"
" s_cbranch_scc0 label_end \n"
" s_waitcnt vmcnt(24) & lgkmcnt(0) \n"
" s_barrier \n"
_UK_MFMA_ " v[128:131], acc[128:129], v[224:225], v[128:131] \n"
_UK_MFMA_ " v[128:131], acc[130:131], v[226:227], v[128:131] \n"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[24:27], 0 offen \n"
_UK_MFMA_ " v[128:131], acc[132:133], v[228:229], v[128:131] \n"
_UK_MFMA_ " v[128:131], acc[134:135], v[230:231], v[128:131] \n"
" buffer_load_dword s[20:23], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " v[128:131], acc[136:137], v[232:233], v[128:131] \n"
_UK_MFMA_ " v[128:131], acc[138:139], v[234:235], v[128:131] \n"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " v[128:131], acc[140:141], v[236:237], v[128:131] \n"
_UK_MFMA_ " v[128:131], acc[142:143], v[238:239], v[128:131] \n"
" buffer_load_dword s[20:23], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " v[132:135], acc[128:129], v[240:241], v[132:135] \n"
_UK_MFMA_ " v[132:135], acc[130:131], v[242:243], v[132:135] \n"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " v[132:135], acc[132:133], v[244:245], v[132:135] \n"
_UK_MFMA_ " v[132:135], acc[134:135], v[246:247], v[132:135] \n"
" buffer_load_dword s[20:23], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " v[132:135], acc[136:137], v[248:249], v[132:135] \n"
_UK_MFMA_ " v[132:135], acc[138:139], v[250:251], v[132:135] \n"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " v[132:135], acc[140:141], v[252:253], v[132:135] \n"
_UK_MFMA_ " v[132:135], acc[142:143], v[254:255], v[132:135] \n"
" buffer_load_dword s[20:23], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " v[136:139], acc[144:145], v[224:225], v[136:139] \n"
_UK_MFMA_ " v[136:139], acc[146:147], v[226:227], v[136:139] \n"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[24:27], 0 offen \n"
_UK_MFMA_ " v[136:139], acc[148:149], v[228:229], v[136:139] \n"
_UK_MFMA_ " v[136:139], acc[150:151], v[230:231], v[136:139] \n"
" buffer_load_dword s[20:23], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " v[136:139], acc[152:153], v[232:233], v[136:139] \n"
_UK_MFMA_ " v[136:139], acc[154:155], v[234:235], v[136:139] \n"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " v[136:139], acc[156:157], v[236:237], v[136:139] \n"
_UK_MFMA_ " v[136:139], acc[158:159], v[238:239], v[136:139] \n"
" buffer_load_dword s[20:23], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " v[140:143], acc[144:145], v[240:241], v[140:143] \n"
_UK_MFMA_ " v[140:143], acc[146:147], v[242:243], v[140:143] \n"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " v[140:143], acc[148:149], v[244:245], v[140:143] \n"
_UK_MFMA_ " v[140:143], acc[150:151], v[246:247], v[140:143] \n"
" buffer_load_dword s[20:23], 0 offen lds \n"
" s_add_u32 m0, %[s_size_per_issue], m0 \n"
_UK_MFMA_ " v[140:143], acc[152:153], v[248:249], v[140:143] \n"
_UK_MFMA_ " v[140:143], acc[154:155], v[250:251], v[140:143] \n"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " v[140:143], acc[156:157], v[252:253], v[140:143] \n"
_UK_MFMA_ " v[140:143], acc[158:159], v[254:255], v[140:143] \n"
" buffer_load_dword s[20:23], 0 offen lds \n"
" s_add_u32 m0, 0, %[s_m0_init] \n"
" s_waitcnt vmcnt(32) \n"
_UK_MFMA_ " v[144:147], acc[160:161], v[224:225], v[144:147] \n"
_UK_MFMA_ " v[144:147], acc[162:163], v[226:227], v[144:147] \n"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[24:27], 0 offen \n"
_UK_MFMA_ " v[144:147], acc[164:165], v[228:229], v[144:147] \n"
_UK_MFMA_ " v[144:147], acc[166:167], v[230:231], v[144:147] \n"
" ds_read_b128 v[192:195], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_0]
_UK_MFMA_ " v[144:147], acc[168:169], v[232:233], v[144:147] \n"
_UK_MFMA_ " v[144:147], acc[170:171], v[234:235], v[144:147] \n"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " v[144:147], acc[172:173], v[236:237], v[144:147] \n"
_UK_MFMA_ " v[144:147], acc[174:175], v[238:239], v[144:147] \n"
" ds_read_b128 v[196:199], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_1]
_UK_MFMA_ " v[148:151], acc[160:161], v[240:241], v[148:151] \n"
_UK_MFMA_ " v[148:151], acc[162:163], v[242:243], v[148:151] \n"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " v[148:151], acc[164:165], v[244:245], v[148:151] \n"
_UK_MFMA_ " v[148:151], acc[166:167], v[246:247], v[148:151] \n"
" ds_read_b128 v[200:203], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_2]
_UK_MFMA_ " v[148:151], acc[168:169], v[248:249], v[148:151] \n"
_UK_MFMA_ " v[148:151], acc[170:171], v[250:251], v[148:151] \n"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " v[148:151], acc[172:173], v[252:253], v[148:151] \n"
_UK_MFMA_ " v[148:151], acc[174:175], v[254:255], v[148:151] \n"
" ds_read_b128 v[204:207], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_3]
_UK_MFMA_ " v[152:155], acc[176:177], v[224:225], v[152:155] \n"
_UK_MFMA_ " v[152:155], acc[178:179], v[226:227], v[152:155] \n"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[24:27], 0 offen \n"
_UK_MFMA_ " v[152:155], acc[180:181], v[228:229], v[152:155] \n"
_UK_MFMA_ " v[152:155], acc[182:183], v[230:231], v[152:155] \n"
" ds_read_b128 v[208:211], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_4]
_UK_MFMA_ " v[152:155], acc[184:185], v[232:233], v[152:155] \n"
_UK_MFMA_ " v[152:155], acc[186:187], v[234:235], v[152:155] \n"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " v[152:155], acc[188:189], v[236:237], v[152:155] \n"
_UK_MFMA_ " v[152:155], acc[190:191], v[238:239], v[152:155] \n"
" ds_read_b128 v[212:215], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_5]
_UK_MFMA_ " v[156:159], acc[176:177], v[240:241], v[156:159] \n"
_UK_MFMA_ " v[156:159], acc[178:179], v[242:243], v[156:159] \n"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " v[156:159], acc[180:181], v[244:245], v[156:159] \n"
_UK_MFMA_ " v[156:159], acc[182:183], v[246:247], v[156:159] \n"
" ds_read_b128 v[216:219], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_6]
_UK_MFMA_ " v[156:159], acc[184:185], v[248:249], v[156:159] \n"
_UK_MFMA_ " v[156:159], acc[186:187], v[250:251], v[156:159] \n"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " v[156:159], acc[188:189], v[252:253], v[156:159] \n"
_UK_MFMA_ " v[156:159], acc[190:191], v[254:255], v[156:159] \n"
" ds_read_b128 v[220:223], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_7]
" s_waitcnt vmcnt(32) \n"
_UK_MFMA_ " v[160:163], acc[192:193], v[224:225], v[160:163] \n"
_UK_MFMA_ " v[160:163], acc[194:195], v[226:227], v[160:163] \n"
" buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[24:27], 0 offen \n"
_UK_MFMA_ " v[160:163], acc[196:197], v[228:229], v[160:163] \n"
_UK_MFMA_ " v[160:163], acc[198:199], v[230:231], v[160:163] \n"
_UK_MFMA_ " v[160:163], acc[200:201], v[232:233], v[160:163] \n"
_UK_MFMA_ " v[160:163], acc[202:203], v[234:235], v[160:163] \n"
" buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " v[160:163], acc[204:205], v[236:237], v[160:163] \n"
_UK_MFMA_ " v[160:163], acc[206:207], v[238:239], v[160:163] \n"
_UK_MFMA_ " v[164:167], acc[192:193], v[240:241], v[164:167] \n"
_UK_MFMA_ " v[164:167], acc[194:195], v[242:243], v[164:167] \n"
" buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " v[164:167], acc[196:197], v[244:245], v[164:167] \n"
_UK_MFMA_ " v[164:167], acc[198:199], v[246:247], v[164:167] \n"
_UK_MFMA_ " v[164:167], acc[200:201], v[248:249], v[164:167] \n"
_UK_MFMA_ " v[164:167], acc[202:203], v[250:251], v[164:167] \n"
" buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " v[164:167], acc[204:205], v[252:253], v[164:167] \n"
_UK_MFMA_ " v[164:167], acc[206:207], v[254:255], v[164:167] \n"
_UK_MFMA_ " v[168:171], acc[208:209], v[224:225], v[168:171] \n"
_UK_MFMA_ " v[168:171], acc[210:211], v[226:227], v[168:171] \n"
" buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[24:27], 0 offen \n"
_UK_MFMA_ " v[168:171], acc[212:213], v[228:229], v[168:171] \n"
_UK_MFMA_ " v[168:171], acc[214:215], v[230:231], v[168:171] \n"
_UK_MFMA_ " v[168:171], acc[216:217], v[232:233], v[168:171] \n"
_UK_MFMA_ " v[168:171], acc[218:219], v[234:235], v[168:171] \n"
" buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " v[168:171], acc[220:221], v[236:237], v[168:171] \n"
_UK_MFMA_ " v[168:171], acc[222:223], v[238:239], v[168:171] \n"
_UK_MFMA_ " v[172:175], acc[208:209], v[240:241], v[172:175] \n"
_UK_MFMA_ " v[172:175], acc[210:211], v[242:243], v[172:175] \n"
" buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " v[172:175], acc[212:213], v[244:245], v[172:175] \n"
_UK_MFMA_ " v[172:175], acc[214:215], v[246:247], v[172:175] \n"
_UK_MFMA_ " v[172:175], acc[216:217], v[248:249], v[172:175] \n"
_UK_MFMA_ " v[172:175], acc[218:219], v[250:251], v[172:175] \n"
" buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " v[172:175], acc[220:221], v[252:253], v[172:175] \n"
_UK_MFMA_ " v[172:175], acc[222:223], v[254:255], v[172:175] \n"
" s_waitcnt vmcnt(32) \n"
_UK_MFMA_ " v[176:179], acc[224:225], v[224:225], v[176:179] \n"
_UK_MFMA_ " v[176:179], acc[226:227], v[226:227], v[176:179] \n"
" buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[24:27], 0 offen \n"
_UK_MFMA_ " v[176:179], acc[228:229], v[228:229], v[176:179] \n"
_UK_MFMA_ " v[176:179], acc[230:231], v[230:231], v[176:179] \n"
_UK_MFMA_ " v[176:179], acc[232:233], v[232:233], v[176:179] \n"
_UK_MFMA_ " v[176:179], acc[234:235], v[234:235], v[176:179] \n"
" buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " v[176:179], acc[236:237], v[236:237], v[176:179] \n"
_UK_MFMA_ " v[176:179], acc[238:239], v[238:239], v[176:179] \n"
_UK_MFMA_ " v[180:183], acc[224:225], v[240:241], v[180:183] \n"
_UK_MFMA_ " v[180:183], acc[226:227], v[242:243], v[180:183] \n"
" buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " v[180:183], acc[228:229], v[244:245], v[180:183] \n"
_UK_MFMA_ " v[180:183], acc[230:231], v[246:247], v[180:183] \n"
_UK_MFMA_ " v[180:183], acc[232:233], v[248:249], v[180:183] \n"
_UK_MFMA_ " v[180:183], acc[234:235], v[250:251], v[180:183] \n"
" buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " v[180:183], acc[236:237], v[252:253], v[180:183] \n"
_UK_MFMA_ " v[180:183], acc[238:239], v[254:255], v[180:183] \n"
_UK_MFMA_ " v[184:187], acc[240:241], v[224:225], v[184:187] \n"
_UK_MFMA_ " v[184:187], acc[242:243], v[226:227], v[184:187] \n"
" buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[24:27], 0 offen \n"
_UK_MFMA_ " v[184:187], acc[244:245], v[228:229], v[184:187] \n"
_UK_MFMA_ " v[184:187], acc[246:247], v[230:231], v[184:187] \n"
_UK_MFMA_ " v[184:187], acc[248:249], v[232:233], v[184:187] \n"
_UK_MFMA_ " v[184:187], acc[250:251], v[234:235], v[184:187] \n"
" buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[24:27], 0 offen offset:1024 \n"
_UK_MFMA_ " v[184:187], acc[252:253], v[236:237], v[184:187] \n"
_UK_MFMA_ " v[184:187], acc[254:255], v[238:239], v[184:187] \n"
_UK_MFMA_ " v[188:191], acc[240:241], v[240:241], v[188:191] \n"
_UK_MFMA_ " v[188:191], acc[242:243], v[242:243], v[188:191] \n"
" buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[24:27], 0 offen offset:2048 \n"
_UK_MFMA_ " v[188:191], acc[244:245], v[244:245], v[188:191] \n"
_UK_MFMA_ " v[188:191], acc[246:247], v[246:247], v[188:191] \n"
_UK_MFMA_ " v[188:191], acc[248:249], v[248:249], v[188:191] \n"
_UK_MFMA_ " v[188:191], acc[250:251], v[250:251], v[188:191] \n"
" buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[24:27], 0 offen offset:3072 \n"
_UK_MFMA_ " v[188:191], acc[252:253], v[252:253], v[188:191] \n"
_UK_MFMA_ " v[188:191], acc[254:255], v[254:255], v[188:191] \n"
" s_add_u32 s60, 0x00000300, s80 \n"
" s_cmp_lt_u32 s60, %[s_loop_cnt] \n"
" s_cselect_b32 s57, s57, 0 \n"
" s_add_u32 s60, 0x00000200, s80 \n"
" s_cmp_lt_u32 s60, %[s_loop_cnt] \n"
" s_cselect_b32 s58, s58, 0 \n"
" s_add_u32 s20, s57, s20 \n"
" s_addc_u32 s21, 0, s21 \n"
" s_add_u32 s24, s58, s24 \n"
" s_addc_u32 s25, 0, s25 \n"
" s_addk_i32 s80, 0x0100 \n"
" s_cmp_lt_i32 s80, %[s_loop_cnt] \n"
" s_cbranch_scc0 label_end \n"
" s_branch label_start%= \n"
" label_end : \n"
//dequant
" v_cvt_f32_i32 v128, v128 \n"
" v_cvt_f32_i32 v129, v129 \n"
" v_cvt_f32_i32 v130, v130 \n"
" v_cvt_f32_i32 v131, v131 \n"
" v_mul_f32 v128, v14, v128 \n"
" v_mul_f32 v129, v14, v129 \n"
" v_mul_f32 v130, v14, v130 \n"
" v_mul_f32 v131, v14, v131 \n"
" v_mul_f32 v128, v16, v128 row_newbcast:0 \n"
" v_mul_f32 v129, v16, v129 row_newbcast:1 \n"
" v_mul_f32 v130, v16, v130 row_newbcast:2 \n"
" v_mul_f32 v131, v16, v131 row_newbcast:3 \n"
" v_cvt_f32_i32 v132, v132 \n"
" v_cvt_f32_i32 v133, v133 \n"
" v_cvt_f32_i32 v134, v134 \n"
" v_cvt_f32_i32 v135, v135 \n"
" v_mul_f32 v132, v15, v132 \n"
" v_mul_f32 v133, v15, v133 \n"
" v_mul_f32 v134, v15, v134 \n"
" v_mul_f32 v135, v15, v135 \n"
" v_mul_f32 v132, v16, v132 row_newbcast:0 \n"
" v_mul_f32 v133, v16, v133 row_newbcast:1 \n"
" v_mul_f32 v134, v16, v134 row_newbcast:2 \n"
" v_mul_f32 v135, v16, v135 row_newbcast:3 \n"
" v_cvt_f32_i32 v136, v136 \n"
" v_cvt_f32_i32 v137, v137 \n"
" v_cvt_f32_i32 v138, v138 \n"
" v_cvt_f32_i32 v139, v139 \n"
" v_mul_f32 v136, v14, v136 \n"
" v_mul_f32 v137, v14, v137 \n"
" v_mul_f32 v138, v14, v138 \n"
" v_mul_f32 v139, v14, v139 \n"
" v_mul_f32 v136, v16, v136 row_newbcast:4 \n"
" v_mul_f32 v137, v16, v137 row_newbcast:5 \n"
" v_mul_f32 v138, v16, v138 row_newbcast:6 \n"
" v_mul_f32 v139, v16, v139 row_newbcast:7 \n"
" v_cvt_f32_i32 v140, v140 \n"
" v_cvt_f32_i32 v141, v141 \n"
" v_cvt_f32_i32 v142, v142 \n"
" v_cvt_f32_i32 v143, v143 \n"
" v_mul_f32 v140, v15, v140 \n"
" v_mul_f32 v141, v15, v141 \n"
" v_mul_f32 v142, v15, v142 \n"
" v_mul_f32 v143, v15, v143 \n"
" v_mul_f32 v140, v16, v140 row_newbcast:4 \n"
" v_mul_f32 v141, v16, v141 row_newbcast:5 \n"
" v_mul_f32 v142, v16, v142 row_newbcast:6 \n"
" v_mul_f32 v143, v16, v143 row_newbcast:7 \n"
" v_cvt_f32_i32 v144, v144 \n"
" v_cvt_f32_i32 v145, v145 \n"
" v_cvt_f32_i32 v146, v146 \n"
" v_cvt_f32_i32 v147, v147 \n"
" v_mul_f32 v144, v14, v144 \n"
" v_mul_f32 v145, v14, v145 \n"
" v_mul_f32 v146, v14, v146 \n"
" v_mul_f32 v147, v14, v147 \n"
" v_mul_f32 v144, v16, v144 row_newbcast:8 \n"
" v_mul_f32 v145, v16, v145 row_newbcast:9 \n"
" v_mul_f32 v146, v16, v146 row_newbcast:10 \n"
" v_mul_f32 v147, v16, v147 row_newbcast:11 \n"
" v_cvt_f32_i32 v148, v148 \n"
" v_cvt_f32_i32 v149, v149 \n"
" v_cvt_f32_i32 v150, v150 \n"
" v_cvt_f32_i32 v151, v151 \n"
" v_mul_f32 v148, v15, v148 \n"
" v_mul_f32 v149, v15, v149 \n"
" v_mul_f32 v150, v15, v150 \n"
" v_mul_f32 v151, v15, v151 \n"
" v_mul_f32 v148, v16, v148 row_newbcast:8 \n"
" v_mul_f32 v149, v16, v149 row_newbcast:9 \n"
" v_mul_f32 v150, v16, v150 row_newbcast:10 \n"
" v_mul_f32 v151, v16, v151 row_newbcast:11 \n"
" v_cvt_f32_i32 v152, v152 \n"
" v_cvt_f32_i32 v153, v153 \n"
" v_cvt_f32_i32 v154, v154 \n"
" v_cvt_f32_i32 v155, v155 \n"
" v_mul_f32 v152, v14, v152 \n"
" v_mul_f32 v153, v14, v153 \n"
" v_mul_f32 v154, v14, v154 \n"
" v_mul_f32 v155, v14, v155 \n"
" v_mul_f32 v152, v16, v152 row_newbcast:12 \n"
" v_mul_f32 v153, v16, v153 row_newbcast:13 \n"
" v_mul_f32 v154, v16, v154 row_newbcast:14 \n"
" v_mul_f32 v155, v16, v155 row_newbcast:15 \n"
" v_cvt_f32_i32 v156, v156 \n"
" v_cvt_f32_i32 v157, v157 \n"
" v_cvt_f32_i32 v158, v158 \n"
" v_cvt_f32_i32 v159, v159 \n"
" v_mul_f32 v156, v15, v156 \n"
" v_mul_f32 v157, v15, v157 \n"
" v_mul_f32 v158, v15, v158 \n"
" v_mul_f32 v159, v15, v159 \n"
" v_mul_f32 v156, v16, v156 row_newbcast:12 \n"
" v_mul_f32 v157, v16, v157 row_newbcast:13 \n"
" v_mul_f32 v158, v16, v158 row_newbcast:14 \n"
" v_mul_f32 v159, v16, v159 row_newbcast:15 \n"
" v_cvt_f32_i32 v160, v160 \n"
" v_cvt_f32_i32 v161, v161 \n"
" v_cvt_f32_i32 v162, v162 \n"
" v_cvt_f32_i32 v163, v163 \n"
" v_mul_f32 v160, v14, v160 \n"
" v_mul_f32 v161, v14, v161 \n"
" v_mul_f32 v162, v14, v162 \n"
" v_mul_f32 v163, v14, v163 \n"
" v_mul_f32 v160, v17, v160 row_newbcast:0 \n"
" v_mul_f32 v161, v17, v161 row_newbcast:1 \n"
" v_mul_f32 v162, v17, v162 row_newbcast:2 \n"
" v_mul_f32 v163, v17, v163 row_newbcast:3 \n"
" v_cvt_f32_i32 v164, v164 \n"
" v_cvt_f32_i32 v165, v165 \n"
" v_cvt_f32_i32 v166, v166 \n"
" v_cvt_f32_i32 v167, v167 \n"
" v_mul_f32 v164, v15, v164 \n"
" v_mul_f32 v165, v15, v165 \n"
" v_mul_f32 v166, v15, v166 \n"
" v_mul_f32 v167, v15, v167 \n"
" v_mul_f32 v164, v17, v164 row_newbcast:0 \n"
" v_mul_f32 v165, v17, v165 row_newbcast:1 \n"
" v_mul_f32 v166, v17, v166 row_newbcast:2 \n"
" v_mul_f32 v167, v17, v167 row_newbcast:3 \n"
" v_cvt_f32_i32 v168, v168 \n"
" v_cvt_f32_i32 v169, v169 \n"
" v_cvt_f32_i32 v170, v170 \n"
" v_cvt_f32_i32 v171, v171 \n"
" v_mul_f32 v168, v14, v168 \n"
" v_mul_f32 v169, v14, v169 \n"
" v_mul_f32 v170, v14, v170 \n"
" v_mul_f32 v171, v14, v171 \n"
" v_mul_f32 v168, v17, v168 row_newbcast:4 \n"
" v_mul_f32 v169, v17, v169 row_newbcast:5 \n"
" v_mul_f32 v170, v17, v170 row_newbcast:6 \n"
" v_mul_f32 v171, v17, v171 row_newbcast:7 \n"
" v_cvt_f32_i32 v172, v172 \n"
" v_cvt_f32_i32 v173, v173 \n"
" v_cvt_f32_i32 v174, v174 \n"
" v_cvt_f32_i32 v175, v175 \n"
" v_mul_f32 v172, v15, v172 \n"
" v_mul_f32 v173, v15, v173 \n"
" v_mul_f32 v174, v15, v174 \n"
" v_mul_f32 v175, v15, v175 \n"
" v_mul_f32 v172, v17, v172 row_newbcast:4 \n"
" v_mul_f32 v173, v17, v173 row_newbcast:5 \n"
" v_mul_f32 v174, v17, v174 row_newbcast:6 \n"
" v_mul_f32 v175, v17, v175 row_newbcast:7 \n"
" v_cvt_f32_i32 v176, v176 \n"
" v_cvt_f32_i32 v177, v177 \n"
" v_cvt_f32_i32 v178, v178 \n"
" v_cvt_f32_i32 v179, v179 \n"
" v_mul_f32 v176, v14, v176 \n"
" v_mul_f32 v177, v14, v177 \n"
" v_mul_f32 v178, v14, v178 \n"
" v_mul_f32 v179, v14, v179 \n"
" v_mul_f32 v176, v17, v176 row_newbcast:8 \n"
" v_mul_f32 v177, v17, v177 row_newbcast:9 \n"
" v_mul_f32 v178, v17, v178 row_newbcast:10 \n"
" v_mul_f32 v179, v17, v179 row_newbcast:11 \n"
" v_cvt_f32_i32 v180, v180 \n"
" v_cvt_f32_i32 v181, v181 \n"
" v_cvt_f32_i32 v182, v182 \n"
" v_cvt_f32_i32 v183, v183 \n"
" v_mul_f32 v180, v15, v180 \n"
" v_mul_f32 v181, v15, v181 \n"
" v_mul_f32 v182, v15, v182 \n"
" v_mul_f32 v183, v15, v183 \n"
" v_mul_f32 v180, v17, v180 row_newbcast:8 \n"
" v_mul_f32 v181, v17, v181 row_newbcast:9 \n"
" v_mul_f32 v182, v17, v182 row_newbcast:10 \n"
" v_mul_f32 v183, v17, v183 row_newbcast:11 \n"
" v_cvt_f32_i32 v184, v184 \n"
" v_cvt_f32_i32 v185, v185 \n"
" v_cvt_f32_i32 v186, v186 \n"
" v_cvt_f32_i32 v187, v187 \n"
" v_mul_f32 v184, v14, v184 \n"
" v_mul_f32 v185, v14, v185 \n"
" v_mul_f32 v186, v14, v186 \n"
" v_mul_f32 v187, v14, v187 \n"
" v_mul_f32 v184, v17, v184 row_newbcast:12 \n"
" v_mul_f32 v185, v17, v185 row_newbcast:13 \n"
" v_mul_f32 v186, v17, v186 row_newbcast:14 \n"
" v_mul_f32 v187, v17, v187 row_newbcast:15 \n"
" v_cvt_f32_i32 v188, v188 \n"
" v_cvt_f32_i32 v189, v189 \n"
" v_cvt_f32_i32 v190, v190 \n"
" v_cvt_f32_i32 v191, v191 \n"
" v_mul_f32 v188, v15, v188 \n"
" v_mul_f32 v189, v15, v189 \n"
" v_mul_f32 v190, v15, v190 \n"
" v_mul_f32 v191, v15, v191 \n"
" v_mul_f32 v188, v17, v188 row_newbcast:12 \n"
" v_mul_f32 v189, v17, v189 row_newbcast:13 \n"
" v_mul_f32 v190, v17, v190 row_newbcast:14 \n"
" v_mul_f32 v191, v17, v191 row_newbcast:15 \n"
#undef _UK_MFMA_
//dequant end
......@@ -310,7 +310,7 @@ struct FusedMoeGemmPipeline_FlatmmUk
row_coords_o, reinterpret_cast<const TopkWeightDataType*>(kargs.sorted_weight_ptr));
auto uk_0 = Policy::template GetUK_0<Problem>();
auto acc_0 = uk_0(a_res,
auto acc_0= uk_0(a_res,
a_coords,
g_res,
g_coords,
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
#include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp"
namespace ck_tile {
/*
This pipeline deal with a gemm(actually 2 gemm) with one very small(token), one very big(weight)
we need to design the pipeline such that all waves along gemm-N dim (gemm-m only 1 wave)
<----- gemm-N ------>
+----+----+----+----+
| w0 | w1 | w2 | w3 | gemm-m
+----+----+----+----+
*/
template <typename Problem_, typename Policy_ = FusedMoeGemmPipelineFlatmmPolicy>
struct FusedMoeGemmPipeline_FlatmmUk_int8
{
using Problem = remove_cvref_t<Problem_>;
using Policy = remove_cvref_t<Policy_>;
using BlockShape = typename Problem::BlockShape; // this is FusedMoeGemmShape
using ADataType = typename Problem::ADataType;
using GDataType = typename Problem::GDataType;
using DDataType = typename Problem::DDataType;
using AccDataType = typename Problem::AccDataType;
using ODataType = typename Problem::ODataType;
using AScaleDataType = typename Problem::AScaleDataType;
using GScaleDataType = typename Problem::GScaleDataType;
using DScaleDataType = typename Problem::DScaleDataType;
using YSmoothScaleDataType = typename Problem::YSmoothScaleDataType;
using TopkWeightDataType = typename Problem::TopkWeightDataType;
using IndexDataType = typename Problem::IndexDataType;
using YDataType = typename Problem::YDataType;
using Traits = typename Problem::Traits;
static constexpr bool IsGateOnly = Traits::IsGateOnly;
static constexpr bool UseSmoothQuant = Traits::UseSmoothQuant;
static constexpr bool PadHiddenSize = Traits::PadHiddenSize;
static constexpr bool PadIntermediateSize = Traits::PadIntermediateSize;
static constexpr index_t kAlignmentA = Policy::template GetAlignment_A<Problem>();
static constexpr index_t kAlignmentG = Policy::template GetAlignment_G<Problem>();
static constexpr index_t kAlignmentD = Policy::template GetAlignment_D<Problem>();
static constexpr index_t kAlignmentO = Policy::template GetAlignment_O<Problem>();
static constexpr index_t SLD_A = static_cast<index_t>(FusedMoeGemmPipelineSequencerEnum::SLD_A);
static constexpr index_t GLD_A = static_cast<index_t>(FusedMoeGemmPipelineSequencerEnum::GLD_A);
static constexpr index_t GLD_B = static_cast<index_t>(FusedMoeGemmPipelineSequencerEnum::GLD_B);
static constexpr index_t GST_O = static_cast<index_t>(FusedMoeGemmPipelineSequencerEnum::GST_O);
static constexpr index_t kBlockPerCu = []() {
if constexpr(Problem::kBlockPerCu != -1)
return Problem::kBlockPerCu;
else
{
// minimize occupancy
return 2;
}
}();
static constexpr const char* name = "flatmm_uk";
CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize()
{
constexpr index_t smem_0 = Policy::template GetUK_0<Problem>().GetSmemSize();
constexpr index_t smem_1 = Policy::template GetUK_1<Problem>().GetSmemSize();
constexpr index_t smem_bridge =
BlockShape::Block_M0 * BlockShape::Block_N0 * sizeof(YDataType);
return max(smem_0, max(smem_1, smem_bridge));
}
// this is the thread-offset along row/col
CK_TILE_HOST_DEVICE static auto GetACoord()
{
constexpr auto a_dist = Policy::template MakeGlobalTileDistribution_A<Problem>();
const auto a_coord = a_dist.calculate_index();
return a_coord;
}
// this is the thread-offset along row/col
CK_TILE_HOST_DEVICE static auto GetOCoord()
{
constexpr auto o_dist = Policy::template MakeOGlobalTileDistribution<Problem>();
const auto o_coord = o_dist.calculate_index();
return o_coord;
}
CK_TILE_DEVICE constexpr auto GetNumRowCoords_A()
{
constexpr index_t KLans = BlockShape::Block_K0 / kAlignmentA;
constexpr index_t MLans = BlockShape::BlockSize / KLans;
constexpr index_t MRepeat = BlockShape::Block_M0 / MLans;
return MRepeat;
}
// TODO: properlly support scatter/gather
CK_TILE_DEVICE auto GetRowCoords_A(index_t base_offset)
{
constexpr index_t KLans = BlockShape::Block_K0 / kAlignmentA;
constexpr index_t MLans = BlockShape::BlockSize / KLans;
constexpr index_t MRepeat = BlockShape::Block_M0 / MLans;
auto base_coord = threadIdx.x / KLans + base_offset;
array<index_t, MRepeat> coords;
static_for<0, MRepeat, 1>{}([&](auto i) { coords.at(i) = base_coord + i * MLans; });
return coords;
}
template <typename ROW_COORDS>
CK_TILE_DEVICE auto GetRowID(const ROW_COORDS coords, const IndexDataType* sorted_token_ids_ptr)
{
constexpr index_t n_size = coords.size();
array<index_t, n_size> row_ids;
static_for<0, n_size, 1>{}([&](auto i) {
row_ids.at(i) = sorted_token_ids_ptr[coords[i]]; // base_coord + i * MLans;
});
return row_ids;
}
template <typename ROW_COORDS>
CK_TILE_DEVICE auto GetWeightScale(const ROW_COORDS coords,
const TopkWeightDataType* sorted_weight_ptr)
{
constexpr index_t n_size = coords.size();
array<TopkWeightDataType, n_size> w;
static_for<0, n_size, 1>{}([&](auto i) {
w.at(i) = sorted_weight_ptr[coords[i]]; // base_coord + i * MLans;
});
return w;
}
// TODO: this row id is before shuffle atomic, need use acc distribution
CK_TILE_DEVICE auto GetRowCoords_O(index_t base_offset)
{
constexpr index_t MLanes = BlockShape::Warp_M1;
constexpr index_t Repeat_M = BlockShape::Repeat_M1;
auto base_coord = threadIdx.x % MLanes + base_offset;
array<index_t, Repeat_M> coords;
static_for<0, Repeat_M, 1>{}([&](auto i) { coords.at(i) = base_coord + i * MLanes; });
return coords;
}
template <typename Karg>
CK_TILE_DEVICE auto operator()(const Karg& kargs,
CK_TILE_LDS_ADDR void* smem,
index_t sorted_tile_id,
index_t intermediate_tile_id)
{
constexpr index_t hidden_radio_0 = IsGateOnly ? 1 : 2;
ck_tile::index_t shared_intermediate_size_0 = kargs.intermediate_size;
ck_tile::index_t shared_intermediate_size_1 = kargs.intermediate_size / hidden_radio_0;
index_t nr_0 = shared_intermediate_size_0 / BlockShape::Warp_N0; // divide N in W
index_t kr_0 = kargs.hidden_size / BlockShape::Warp_K0; // divide K in W
index_t nr_1 = kargs.hidden_size / BlockShape::Warp_N1;
index_t kr_1 = shared_intermediate_size_1 / BlockShape::Warp_K1;
const IndexDataType expert_id = __builtin_amdgcn_readfirstlane(
reinterpret_cast<const IndexDataType*>(kargs.sorted_expert_ids_ptr)[sorted_tile_id]);
index_t expert_stride_0 = shared_intermediate_size_0 * kargs.hidden_size;
index_t expert_stride_1 = shared_intermediate_size_1 * kargs.hidden_size;
/////////////
index_t a_scale_expert_stride_0 = kargs.hidden_size;
index_t g_scale_expert_stride_0 = shared_intermediate_size_0;
index_t d_scale_expert_stride_1 = kargs.hidden_size;
// nr*kr*w
index_t interm_idx_nr0 = __builtin_amdgcn_readfirstlane(
intermediate_tile_id *
BlockShape::Block_Nr0); // intermediate_tile_id * Block_N / (N in W)
index_t interm_idx_kr1 = __builtin_amdgcn_readfirstlane(
intermediate_tile_id *
BlockShape::Block_Kr1); // intermediate_tile_id * Block_N / (N in W)
auto row_coords_a = GetRowCoords_A(sorted_tile_id * BlockShape::Block_M0);
auto row_ids_a = GetRowID(
row_coords_a, reinterpret_cast<const IndexDataType*>(kargs.sorted_token_ids_ptr));
auto token_id = row_ids_a & 0xffffff;
//addr in fact
auto a_coords = generate_tuple(
[&](auto i) {
return (token_id) * kargs.stride_token +
threadIdx.x % (BlockShape::Block_K0 / kAlignmentA) * kAlignmentA;
},
number<row_ids_a.size()>{});
auto a_res =
make_wave_buffer_resource(reinterpret_cast<const ADataType*>(kargs.a_ptr),
kargs.num_tokens * kargs.stride_token * sizeof(ADataType));
//////aq
auto aq_win = [&]() {
const AScaleDataType* aq_ptr = reinterpret_cast<const AScaleDataType*>(kargs.a_scale_ptr);
auto aq_view_ = make_naive_tensor_view<address_space_enum::global>(
aq_ptr,
make_tuple(kargs.num_tokens * kargs.topk),
number<1>{});
return aq_view_;
}();
auto aq_res = aq_win.get_buffer_view().cached_buf_res_;
////////
auto g_win = [&]() {
const GDataType* g_ptr = reinterpret_cast<const GDataType*>(kargs.g_ptr) +
static_cast<long_index_t>(expert_id) * expert_stride_0 +
interm_idx_nr0 * kr_0 * BlockShape::Block_W0;
auto g_view_ = make_naive_tensor_view<address_space_enum::global>(
g_ptr,
make_tuple(nr_0, kr_0, number<BlockShape::Block_W0>{}),
make_tuple(kr_0 * BlockShape::Block_W0, number<BlockShape::Block_W0>{}, 1),
number<kAlignmentG>{},
number<1>{});
auto g_window_ = make_tile_window_linear_raw(
g_view_,
make_tuple(number<BlockShape::Block_Nr0>{},
number<BlockShape::Block_Kr0>{},
number<BlockShape::Block_W0>{}),
{0, 0, 0},
Policy::template MakeGlobalTileDistribution_G<Problem>(),
sequence<0, 1, 1>{});
return g_window_;
}();
auto g_res = g_win.get_bottom_tensor_view().get_buffer_view().cached_buf_res_;
auto g_coords = generate_tuple([&](auto i) { return g_win.cached_coords_[i].get_offset(); },
number<decltype(g_win)::NumAccess_NonLinear>{});
//////gq
auto gq_win = [&]() {
const GDataType* g_ptr = reinterpret_cast<const GScaleDataType*>(kargs.g_scale_ptr) +
static_cast<long_index_t>(expert_id) * g_scale_expert_stride_0 +
intermediate_tile_id * BlockShape::Block_N0;
// const GDataType* g_ptr = reinterpret_cast<const GScaleDataType*>(kargs.g_scale_ptr);//remember to add expert id for inline
auto g_view_ = make_naive_tensor_view<address_space_enum::global>(
g_ptr,
make_tuple(shared_intermediate_size_1),
number<1>{});
return g_view_;
}();
auto gq_res = gq_win.get_buffer_view().cached_buf_res_;
////
const auto d_win = [&]() {
const DDataType* d_ptr = reinterpret_cast<const DDataType*>(kargs.d_ptr) +
static_cast<long_index_t>(expert_id) * expert_stride_1 +
interm_idx_kr1 * BlockShape::Block_W1;
// note interm_idx_nr0 is along the gemm-k dim of 2nd gemm
const auto d_view_ = make_naive_tensor_view<address_space_enum::global>(
d_ptr,
make_tuple(nr_1, kr_1, BlockShape::Block_W1),
make_tuple(kr_1 * BlockShape::Block_W1, BlockShape::Block_W1, 1),
number<kAlignmentD>{},
number<1>{});
const auto d_window_ = make_tile_window_linear_raw(
d_view_,
make_tuple(number<BlockShape::Block_Nr1>{},
number<BlockShape::Block_Kr1>{},
number<BlockShape::Block_W1>{}),
{0, 0, 0},
Policy::template MakeGlobalTileDistribution_D<Problem>(),
sequence<0, 1, 1>{});
return d_window_;
}();
auto d_res = d_win.get_bottom_tensor_view().get_buffer_view().cached_buf_res_;
//////gq
auto dq_win = [&]() {
// const GDataType* g_ptr = reinterpret_cast<const GScaleDataType*>(kargs.d_scale_ptr) + static_cast<long_index_t>(expert_id) * d_scale_expert_stride_0;
const GDataType* g_ptr = reinterpret_cast<const GScaleDataType*>(kargs.d_scale_ptr)//remember to add expert_id as expert_idx
auto g_view_ = make_naive_tensor_view<address_space_enum::global>(
g_ptr,
make_tuple(kargs.hidden_size),
number<1>{});
return g_view_;
}();
auto dq_res = dq_win.get_buffer_view().cached_buf_res_;
////
// TODO: load D order is N0.K0...127, N64.K0...127, N0.K128...255, N64.K128...255
// block-k=512, block-n=128
// wg |<----- W_ ----->|
// Nr(2)*Nw(4)* Kr *Kr0(4)*Kr1(4) * [Kl(4)*Nl(16)*Kv(8)]->one issue
// y p y y p p y
// 1 2 0(imm)
auto d_coords = [&]() {
constexpr index_t Nr_ = 4;
constexpr index_t Nw_ = 4;
constexpr index_t Kr0_ = 2;//no more need in int8, method changed, this will be handed in res_s
constexpr index_t Kr1_ = 4;
constexpr index_t Kl_ = 4;
constexpr index_t Nl_ = 16;
constexpr index_t Kv_ = 16;
constexpr index_t W_ = Kl_ * Nl_ * Kv_;
//constexpr index_t num_offsets_ = Nr_ * Kr0_;
constexpr index_t num_offsets_ = Nr_ ;
index_t base_os_ = (threadIdx.x % 64) * Kv_ + (threadIdx.x / 64) *
shared_intermediate_size_1 *
Nl_; // Kr0_ * Kr1_ * W_;
return generate_tuple(
[&](auto i) {
constexpr auto i_nr_ = number<i % Nr_>{};
return i_nr_ * shared_intermediate_size_1 * Nw_ * Nl_ +
base_os_;
},
number<num_offsets_>{});
}();
auto o_coords = generate_tuple(
[&](auto i) {
return token_id * kargs.stride_token +
threadIdx.x % (BlockShape::Block_N1 / kAlignmentO) * kAlignmentO;
},
number<row_ids_a.size()>{});
auto o_flags =
generate_tuple([&](auto i) { return cmp_lt_to_exec(token_id, kargs.num_tokens); },
number<row_ids_a.size()>{});
auto bridge_sst_win = [&]() {
constexpr auto desc_ = Policy::template MakeBridgeLdsStoreForUKDesc<Problem>();
constexpr auto dist_ = Policy::template GetUK_0<Problem>().MakeCBlockDist();
return make_tile_window_linear(make_tensor_view<address_space_enum::lds>(
reinterpret_cast<YDataType*>(smem), desc_),
desc_.get_lengths(),
{0, 0},
dist_);
}();
auto o_res =
make_wave_buffer_resource(reinterpret_cast<const ODataType*>(kargs.o_ptr),
kargs.num_tokens * kargs.stride_token * sizeof(ODataType));
auto row_coords_o = GetRowCoords_O(sorted_tile_id * BlockShape::Block_M0);
auto w_scale = GetWeightScale(
row_coords_o, reinterpret_cast<const TopkWeightDataType*>(kargs.sorted_weight_ptr));
auto uk_0 = Policy::template GetUK_0<Problem>();
auto acc_0= uk_0(
row_ids_a,//fake token id, 2D index for X scale
aq_res,
gq_res,
dq_res,
a_res,
a_coords,
g_res,
g_coords,
smem,
kargs.hidden_size,
BlockShape::Block_K0, // tile offset for B matrix each unroll
BlockShape::Block_Kr0 *
BlockShape::Block_W0); // tile offset for B matrix each unroll
// sweep_tile(
// acc_0,
// [&](auto idx0, auto idx1) {
// fp32x2_t v_{acc_0(idx0), acc_0(idx1)};
// typename Problem::GateActivation{}(v_, v_);
// acc_0(idx0) = v_.x;
// acc_0(idx1) = v_.y;
// },
// sequence<1, 2>{});
// auto y_pre = cast_tile<YDataType>(acc_0);
// block_sync_lds();
// store_tile(bridge_sst_win, y_pre);
// block_sync_lds();
auto uk_1 = Policy::template GetUK_1<Problem>();
uk_1(d_res,
d_coords,
o_res,
o_coords,
o_flags,
smem,
kargs.hidden_size, // total n number
w_scale,
shared_intermediate_size_1 * Block_N1 - kr_1 * BlockShape::Block_W1, // along N
kr_1 * BlockShape::Block_W1,
BlockShape::Block_N1); // along N
}
};
} // namespace ck_tile
......@@ -140,5 +140,10 @@ using WarpGemmMfmaFp8Fp8F32M32N32K16SwizzleBTransposedCDistribution =
WarpGemmAttributeMfmaImpl_f32_32x32x16_f8_base<fp8_t, fp8_t, WGAttrCtlEnum::Default_>,
2,
swizzle_factor>>;
// int8
using WarpGemmMfma_i32_16x16x64_int8_int8_CTransposed =
WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution<
WarpGemmAttributeMfmaImpl_i32_16x16x32_i8<WGAttrCtlEnum::Default_>
2>>;
} // namespace ck_tile
......@@ -618,6 +618,72 @@ struct WarpGemmAttributeMfmaImpl_i32_32x32x16_i8
return c_vec;
}
};
template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
struct WarpGemmAttributeMfmaImpl_i32_16x16x32_i8
{
static constexpr WGAttrCtlEnum Ctrl = Ctrl_;
using ADataType = int8_t;
using BDataType = int8_t;
using CDataType = int32_t;
using AVecType = ext_vector_t<ADataType, 8>;
using BVecType = ext_vector_t<BDataType, 8>;
using CVecType = ext_vector_t<CDataType, 4>;
static constexpr index_t kM = 16;
static constexpr index_t kN = 16;
static constexpr index_t kK = 32;
static constexpr index_t kAMLane = 16;
static constexpr index_t kBNLane = 16;
static constexpr index_t kABKLane = 4;
static constexpr index_t kABKPerLane = 8;
static constexpr index_t kCMLane = 4;
static constexpr index_t kCNLane = 16;
static constexpr index_t kCM0PerLane = 1;
static constexpr index_t kCM1PerLane = 4;
// c_vec += a_vec * b_vec
template <bool post_nop_ = false>
CK_TILE_DEVICE void operator()(CVecType& c_vec,
const AVecType& a_vec,
const BVecType& b_vec,
bool_constant<post_nop_> = {}) const
{
DISPATCH_MFMA_CTRL_("v_mfma_i32_16x16x32_i8", Ctrl)
else
{
#if defined(__gfx94__)
c_vec = __builtin_amdgcn_mfma_i32_16x16x32i8(
bit_cast<long>(a_vec), bit_cast<long>(b_vec), c_vec, 0, 0, 0);
#elif defined(__gfx908__) || defined(__gfx90a__)
static_for<0, 8, 1>{}([&](auto k) {
float a_f32 =
type_convert<float>(reinterpret_cast<const thread_buffer<ADataType, 8>&>(a_vec)
.template get_as<ADataType>()[number<k>{}]);
float b_f32 =
type_convert<float>(reinterpret_cast<const thread_buffer<BDataType, 8>&>(b_vec)
.template get_as<BDataType>()[number<k>{}]);
c_vec = __builtin_amdgcn_mfma_f32_32x32x2f32(a_f32, b_f32, c_vec, 0, 0, 0);
});
#else
ck_tile::ignore = c_vec;
ck_tile::ignore = a_vec;
ck_tile::ignore = b_vec;
#endif
}
}
// c_vec = a_vec * b_vec
CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
{
CVecType c_vec{0};
operator()(c_vec, a_vec, b_vec);
return c_vec;
}
};
#undef DISPATCH_MFMA_
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment