Commit 572865a6 authored by carlushuang's avatar carlushuang
Browse files

update first gemm ok

parent 9ec4e3f7
......@@ -207,8 +207,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
{(max_num_tokens_padded + block_m - 1) / block_m});
ck_tile::HostTensor<IndexDataType> num_sorted_tiles_host({1});
#if 1
#if 1
#if 0
# if 1
ck_tile::FillStepRange<ADataType>{-.5f, .5f, 0.01f}(a_host);
ck_tile::FillStepRange<GDataType>{-.5f, .5f, 0.01f}(g_host);
ck_tile::FillStepRange<DDataType, false>{.5f, -.5f, -0.01f}(d_host);
......@@ -217,7 +217,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
ck_tile::FillStepRange<DScaleDataType>{0.f, 1.f, 0.01f}(sd_host);
ck_tile::FillStepRange<YSmoothScaleDataType>{0.f, 1.f, 0.01f}(sy_host);
ck_tile::FillStepRange<TopkWeightDataType>{-.5f, .5f, 0.01f}(topk_weight_host);
#else
# else
ck_tile::FillUniformDistribution<ADataType>{-.5f, .5f}(a_host);
ck_tile::FillUniformDistribution<GDataType>{-.5f, .5f}(g_host);
ck_tile::FillUniformDistribution<DDataType>{-.5f, .5f}(d_host);
......@@ -226,7 +226,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
ck_tile::FillUniformDistribution<DScaleDataType>{-.5f, .5f}(sd_host);
ck_tile::FillUniformDistribution<YSmoothScaleDataType>{-.5f, .5f}(sy_host);
ck_tile::FillUniformDistribution<TopkWeightDataType>{-.5f, .5f}(topk_weight_host);
#endif
# endif
// permute weight
ck_tile::HostTensor<GDataType> g_perm_host = shuffle_moe_weight(g_host, prec_w, 1);
......@@ -266,6 +266,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
ck_tile::HostTensor<DDataType> d_perm_host = shuffle_moe_weight(d_host, prec_w, 1);
std::cout << "------- @@@ " << __LINE__ << std::flush << std::endl;
# if 0
ck_tile::reference_moe_sorting<TopkWeightDataType, IndexDataType>(
topk_ids_host,
topk_weight_host,
......@@ -318,8 +319,10 @@ bool run(const ck_tile::ArgParser& arg_parser)
}
return 1;
#endif
# endif
#endif
(void)balance;
ck_tile::reference_moe_sorting<TopkWeightDataType, IndexDataType>(
topk_ids_host,
topk_weight_host,
......
......@@ -432,23 +432,34 @@ struct tile_window_linear
CK_TILE_DEVICE static constexpr index_t get_bottom_linear_offset(number<i_access>)
{
constexpr auto linear_coord = get_bottom_linear_coordinate(number<i_access>{});
// since this is linear offset, we assum bottom X tensor is always linear
constexpr index_t linear_offset = [&]() {
constexpr auto x_idx_ = linear_coord;
constexpr auto x_len_ = TileDstr{}.get_lengths();
static_assert(x_idx_.size() == x_len_.size());
constexpr index_t x_dims_ = x_idx_.size();
index_t cu_stride_ = 1;
index_t cu_offset_ = 0;
static_for<0, x_dims_, 1>{}([&](auto i_) {
auto r_i_ = number<x_dims_ - i_ - 1>{};
cu_offset_ += x_idx_[r_i_] * cu_stride_;
cu_stride_ *= x_len_[r_i_];
});
return cu_offset_;
}();
return linear_offset;
constexpr auto is_pure_linear_tensor = reduce_on_sequence(LinearBottomDims{}, multiplies{}, number<1>{});
if constexpr (is_pure_linear_tensor) {
// this case usually is a LDS window, everything is build time know.
// we directly use BottomTensorView to compute the offset, in case there is any padding
auto bottom_tensor_coord = make_tensor_coordinate(
BottomTensorView{}.get_tensor_descriptor(), linear_coord);
return bottom_tensor_coord.get_offset();
} else {
// this case usually is a global window, where last dim can be linear
// we hack here, that use the original TileDstr to compute the linear offset
// ... hoping that there is no extra padding between other dims, which make sense
// since that sould introduce runtime length
constexpr index_t linear_offset = [&]() {
constexpr auto x_idx_ = linear_coord;
constexpr auto x_len_ = TileDstr{}.get_lengths();
static_assert(x_idx_.size() == x_len_.size());
constexpr index_t x_dims_ = x_idx_.size();
index_t cu_stride_ = 1;
index_t cu_offset_ = 0;
static_for<0, x_dims_, 1>{}([&](auto i_) {
auto r_i_ = number<x_dims_ - i_ - 1>{};
cu_offset_ += x_idx_[r_i_] * cu_stride_;
cu_stride_ *= x_len_[r_i_];
});
return cu_offset_;
}();
return linear_offset;
}
}
CK_TILE_DEVICE constexpr auto get_num_of_access() const { return traits::NumAccess; }
......
......@@ -122,6 +122,7 @@ void reference_fused_moe(
type_convert<AccDataType>(g_host(i_expert, i_n, i_k));
}
acc_0(0, i_n) = acc;
// printf("ie:%2d, it:%3d, in:%d, %f\n", i_expert, i_token, i_n, acc);
}
ck_tile::HostTensor<AccDataType> y({1, intermediate_size_1});
......@@ -134,6 +135,7 @@ void reference_fused_moe(
for(ck_tile::index_t i_n = 0; i_n < intermediate_size_1; i_n++)
{
Activation{}(y(0, i_n), acc_0(0, i_n));
printf("ie:%2d, it:%3d, in:%d, %f\n", i_expert, i_token, i_n, y(0, i_n));
}
}
else
......@@ -168,7 +170,8 @@ void reference_fused_moe(
}
};
make_ParallelTensorFunctor(f, max_num_tokens_padded)(std::thread::hardware_concurrency());
// make_ParallelTensorFunctor(f, max_num_tokens_padded)(std::thread::hardware_concurrency());
make_ParallelTensorFunctor(f, max_num_tokens_padded)(1);
// reduce
auto r = [&](auto i_token) {
......
......@@ -596,9 +596,9 @@ struct FastGeluAsm
CK_TILE_DEVICE void operator()<float, float>(float& y, const float& x) const
{
// const float u = 2.f * x * (0.035677f * x * x + 0.797885f);
const float c1 = 0xbd92220c; // -2.0 * 0.035677f;
const float c2 = -2.0 * 0.797885f;
const float log2e_ = 0x3fb8aa3b; // log2e_v<float>;
const uint32_t c1 = 0xbd92220c; // -2.0 * 0.035677f;
const float c2 = -2.0 * 0.797885f;
const uint32_t log2e_ = 0x3fb8aa3b; // log2e_v<float>;
float tmp;
asm volatile("v_mul_f32 %[v_tmp], %[v_x], %[v_x] ; x*x\n"
......@@ -606,13 +606,63 @@ struct FastGeluAsm
"v_mul_f32 %[v_tmp], %[v_tmp], %[v_x] ; x*(c1*x*x+c2)\n"
"v_mul_f32 %[v_tmp], %[v_tmp], %[s_log2e] ; log2e*x*(c1*x*x+c2)\n"
"v_exp_f32 %[v_tmp], %[v_tmp] ; emu = exp2(log2e*x*(c1*x*x+c2))\n"
"s_nop 0 ; hazard for exp\n"
"v_add_f32 %[v_tmp], %[v_tmp], 1.0 ; emu+1.0f\n"
"v_rcp_f32 %[v_tmp], %[v_tmp] ; 1/(emu+1.0f)\n"
"s_nop 0 ; hazard for rcp \n"
"v_mul_f32 %[v_y], %[v_tmp], %[v_x] ; x * 1/(emu+1f)\n"
: [v_y] "=v"(y), [v_tmp] "+v"(tmp)
: [v_x] "v"(x), [s_c1] "s"(c1), [v_c2] "v"(c2), [s_log2e] "s"(log2e_)
:);
}
template <>
CK_TILE_HOST void operator()<fp32x2_t, fp32x2_t>(fp32x2_t& y, const fp32x2_t& x) const
{
// const float u = -2.f * x * (0.035677f * x * x + 0.797885f);
const float c1 = -2.0 * 0.035677f;
const float c2 = -2.0 * 0.797885f;
const float u0 = x.x * (c1 * x.x * x.x + c2);
const float emu0 = exp(u0);
y.x = x.x / (1.f + emu0);
const float u1 = x.y * (c1 * x.y * x.y + c2);
const float emu1 = exp(u1);
y.y = x.y / (1.f + emu1);
}
// this is packed verion to remove data hazard for trans
template <>
CK_TILE_DEVICE void operator()<fp32x2_t, fp32x2_t>(fp32x2_t& y, const fp32x2_t& x) const
{
// const float u = 2.f * x * (0.035677f * x * x + 0.797885f);
const uint32_t c1 = 0xbd92220c; // -2.0 * 0.035677f;
const float c2 = -2.0 * 0.797885f;
const uint32_t log2e_ = 0x3fb8aa3b; // log2e_v<float>;
float tmp0, tmp1;
float y0, y1;
asm volatile("v_mul_f32 %[v_tmp0], %[v_x0], %[v_x0] ; x*x\n"
"v_mul_f32 %[v_tmp1], %[v_x1], %[v_x1] ; x*x\n"
"v_fma_f32 %[v_tmp0], %[v_tmp0], %[s_c1], %[v_c2] ; c1*x*x+c2\n"
"v_fma_f32 %[v_tmp1], %[v_tmp1], %[s_c1], %[v_c2] ; c1*x*x+c2\n"
"v_mul_f32 %[v_tmp0], %[v_tmp0], %[v_x0] ; x*(c1*x*x+c2)\n"
"v_mul_f32 %[v_tmp1], %[v_tmp1], %[v_x1] ; x*(c1*x*x+c2)\n"
"v_mul_f32 %[v_tmp0], %[v_tmp0], %[s_log2e] ; log2e*x*(c1*x*x+c2)\n"
"v_mul_f32 %[v_tmp1], %[v_tmp1], %[s_log2e] ; log2e*x*(c1*x*x+c2)\n"
"v_exp_f32 %[v_tmp0], %[v_tmp0] ; emu = exp2(log2e*x*(c1*x*x+c2))\n"
"v_exp_f32 %[v_tmp1], %[v_tmp1] ; emu = exp2(log2e*x*(c1*x*x+c2))\n"
"v_add_f32 %[v_tmp0], %[v_tmp0], 1.0 ; emu+1.0f\n"
"v_add_f32 %[v_tmp1], %[v_tmp1], 1.0 ; emu+1.0f\n"
"v_rcp_f32 %[v_tmp0], %[v_tmp0] ; 1/(emu+1.0f)\n"
"v_rcp_f32 %[v_tmp1], %[v_tmp1] ; 1/(emu+1.0f)\n"
"v_mul_f32 %[v_y0], %[v_tmp0], %[v_x0] ; x * 1/(emu+1f)\n"
"v_mul_f32 %[v_y1], %[v_tmp1], %[v_x1] ; x * 1/(emu+1f)\n"
: [v_y0] "=v"(y0), [v_y1] "=v"(y1), [v_tmp0] "+v"(tmp0), [v_tmp1] "+v"(tmp1)
: [v_x0] "v"(x.x), [v_x1] "v"(x.y), [s_c1] "s"(c1), [v_c2] "v"(c2), [s_log2e] "s"(log2e_)
:);
y.x = y0;
y.y = y1;
}
};
// https://paperswithcode.com/method/gelu
......
......@@ -262,168 +262,168 @@ struct FlatmmSnUK_GFX9_32x128x512_1x4x1_16x16x16_BF16
" s_waitcnt vmcnt(32) \n"
" s_barrier \n"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[0:1], v[128:129], 0 \n"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[2:3], v[130:131], [%[c0], %[c1], %[c2], %[c3]] \n"
" buffer_load_dwordx4 acc[128:131], %[v_os_b0], s[12:15], 0 offen \n"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[2:3], v[130:131], [%[c0], %[c1], %[c2], %[c3]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[4:5], v[132:133], [%[c0], %[c1], %[c2], %[c3]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[6:7], v[134:135], [%[c0], %[c1], %[c2], %[c3]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[8:9], v[136:137], [%[c0], %[c1], %[c2], %[c3]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[10:11], v[138:139], [%[c0], %[c1], %[c2], %[c3]] \n"
" buffer_load_dwordx4 acc[132:135], %[v_os_b0], s[12:15], 0 offen offset:1024 \n"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[10:11], v[138:139], [%[c0], %[c1], %[c2], %[c3]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[12:13], v[140:141], [%[c0], %[c1], %[c2], %[c3]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[14:15], v[142:143], [%[c0], %[c1], %[c2], %[c3]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[0:1], v[192:193], 0 \n"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[2:3], v[194:195], [%[c4], %[c5], %[c6], %[c7]] \n"
" buffer_load_dwordx4 acc[136:139], %[v_os_b0], s[12:15], 0 offen offset:2048 \n"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[2:3], v[194:195], [%[c4], %[c5], %[c6], %[c7]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[4:5], v[196:197], [%[c4], %[c5], %[c6], %[c7]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[6:7], v[198:199], [%[c4], %[c5], %[c6], %[c7]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[8:9], v[200:201], [%[c4], %[c5], %[c6], %[c7]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[10:11], v[202:203], [%[c4], %[c5], %[c6], %[c7]] \n"
" buffer_load_dwordx4 acc[140:143], %[v_os_b0], s[12:15], 0 offen offset:3072 \n"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[10:11], v[202:203], [%[c4], %[c5], %[c6], %[c7]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[12:13], v[204:205], [%[c4], %[c5], %[c6], %[c7]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[14:15], v[206:207], [%[c4], %[c5], %[c6], %[c7]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[16:17], v[128:129], 0 \n"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[18:19], v[130:131], [%[c8],%[c9],%[c10],%[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8], %[c9], %[c10], %[c11]], acc[16:17], v[128:129], 0 \n"
" buffer_load_dwordx4 acc[144:147], %[v_os_b1], s[12:15], 0 offen \n"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[20:21], v[132:133], [%[c8],%[c9],%[c10],%[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[22:23], v[134:135], [%[c8],%[c9],%[c10],%[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[24:25], v[136:137], [%[c8],%[c9],%[c10],%[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[26:27], v[138:139], [%[c8],%[c9],%[c10],%[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8], %[c9], %[c10], %[c11]], acc[18:19], v[130:131], [%[c8], %[c9], %[c10], %[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8], %[c9], %[c10], %[c11]], acc[20:21], v[132:133], [%[c8], %[c9], %[c10], %[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8], %[c9], %[c10], %[c11]], acc[22:23], v[134:135], [%[c8], %[c9], %[c10], %[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8], %[c9], %[c10], %[c11]], acc[24:25], v[136:137], [%[c8], %[c9], %[c10], %[c11]] \n"
" buffer_load_dwordx4 acc[148:151], %[v_os_b1], s[12:15], 0 offen offset:1024 \n"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[28:29], v[140:141], [%[c8],%[c9],%[c10],%[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[30:31], v[142:143], [%[c8],%[c9],%[c10],%[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[16:17], v[192:193], 0 \n"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[18:19], v[194:195], [%[c12],%[c13],%[c14],%[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8], %[c9], %[c10], %[c11]], acc[26:27], v[138:139], [%[c8], %[c9], %[c10], %[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8], %[c9], %[c10], %[c11]], acc[28:29], v[140:141], [%[c8], %[c9], %[c10], %[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8], %[c9], %[c10], %[c11]], acc[30:31], v[142:143], [%[c8], %[c9], %[c10], %[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12], %[c13], %[c14], %[c15]], acc[16:17], v[192:193], 0 \n"
" buffer_load_dwordx4 acc[152:155], %[v_os_b1], s[12:15], 0 offen offset:2048 \n"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[20:21], v[196:197], [%[c12],%[c13],%[c14],%[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[22:23], v[198:199], [%[c12],%[c13],%[c14],%[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[24:25], v[200:201], [%[c12],%[c13],%[c14],%[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[26:27], v[202:203], [%[c12],%[c13],%[c14],%[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12], %[c13], %[c14], %[c15]], acc[18:19], v[194:195], [%[c12], %[c13], %[c14], %[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12], %[c13], %[c14], %[c15]], acc[20:21], v[196:197], [%[c12], %[c13], %[c14], %[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12], %[c13], %[c14], %[c15]], acc[22:23], v[198:199], [%[c12], %[c13], %[c14], %[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12], %[c13], %[c14], %[c15]], acc[24:25], v[200:201], [%[c12], %[c13], %[c14], %[c15]] \n"
" buffer_load_dwordx4 acc[156:159], %[v_os_b1], s[12:15], 0 offen offset:3072 \n"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[28:29], v[204:205], [%[c12],%[c13],%[c14],%[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[30:31], v[206:207], [%[c12],%[c13],%[c14],%[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12], %[c13], %[c14], %[c15]], acc[26:27], v[202:203], [%[c12], %[c13], %[c14], %[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12], %[c13], %[c14], %[c15]], acc[28:29], v[204:205], [%[c12], %[c13], %[c14], %[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12], %[c13], %[c14], %[c15]], acc[30:31], v[206:207], [%[c12], %[c13], %[c14], %[c15]] \n"
" s_waitcnt vmcnt(32) \n"
" v_mfma_f32_16x16x16_bf16 [%[c0],%[c1],%[c2],%[c3]], acc[32:33], v[144:145], [%[c0],%[c1],%[c2],%[c3]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c0],%[c1],%[c2],%[c3]], acc[34:35], v[146:147], [%[c0],%[c1],%[c2],%[c3]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[32:33], v[144:145], [%[c0], %[c1], %[c2], %[c3]] \n"
" buffer_load_dwordx4 acc[160:163], %[v_os_b2], s[12:15], 0 offen \n"
" v_mfma_f32_16x16x16_bf16 [%[c0],%[c1],%[c2],%[c3]], acc[36:37], v[148:149], [%[c0],%[c1],%[c2],%[c3]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c0],%[c1],%[c2],%[c3]], acc[38:39], v[150:151], [%[c0],%[c1],%[c2],%[c3]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c0],%[c1],%[c2],%[c3]], acc[40:41], v[152:153], [%[c0],%[c1],%[c2],%[c3]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c0],%[c1],%[c2],%[c3]], acc[42:43], v[154:155], [%[c0],%[c1],%[c2],%[c3]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[34:35], v[146:147], [%[c0], %[c1], %[c2], %[c3]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[36:37], v[148:149], [%[c0], %[c1], %[c2], %[c3]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[38:39], v[150:151], [%[c0], %[c1], %[c2], %[c3]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[40:41], v[152:153], [%[c0], %[c1], %[c2], %[c3]] \n"
" buffer_load_dwordx4 acc[164:167], %[v_os_b2], s[12:15], 0 offen offset:1024 \n"
" v_mfma_f32_16x16x16_bf16 [%[c0],%[c1],%[c2],%[c3]], acc[44:45], v[156:157], [%[c0],%[c1],%[c2],%[c3]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c0],%[c1],%[c2],%[c3]], acc[46:47], v[158:159], [%[c0],%[c1],%[c2],%[c3]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c4],%[c5],%[c6],%[c7]], acc[32:33], v[208:209], [%[c4],%[c5],%[c6],%[c7]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c4],%[c5],%[c6],%[c7]], acc[34:35], v[210:211], [%[c4],%[c5],%[c6],%[c7]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[42:43], v[154:155], [%[c0], %[c1], %[c2], %[c3]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[44:45], v[156:157], [%[c0], %[c1], %[c2], %[c3]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[46:47], v[158:159], [%[c0], %[c1], %[c2], %[c3]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[32:33], v[208:209], [%[c4], %[c5], %[c6], %[c7]] \n"
" buffer_load_dwordx4 acc[168:171], %[v_os_b2], s[12:15], 0 offen offset:2048 \n"
" v_mfma_f32_16x16x16_bf16 [%[c4],%[c5],%[c6],%[c7]], acc[36:37], v[212:213], [%[c4],%[c5],%[c6],%[c7]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c4],%[c5],%[c6],%[c7]], acc[38:39], v[214:215], [%[c4],%[c5],%[c6],%[c7]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c4],%[c5],%[c6],%[c7]], acc[40:41], v[216:217], [%[c4],%[c5],%[c6],%[c7]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c4],%[c5],%[c6],%[c7]], acc[42:43], v[218:219], [%[c4],%[c5],%[c6],%[c7]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[34:35], v[210:211], [%[c4], %[c5], %[c6], %[c7]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[36:37], v[212:213], [%[c4], %[c5], %[c6], %[c7]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[38:39], v[214:215], [%[c4], %[c5], %[c6], %[c7]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[40:41], v[216:217], [%[c4], %[c5], %[c6], %[c7]] \n"
" buffer_load_dwordx4 acc[172:175], %[v_os_b2], s[12:15], 0 offen offset:3072 \n"
" v_mfma_f32_16x16x16_bf16 [%[c4],%[c5],%[c6],%[c7]], acc[44:45], v[220:221], [%[c4],%[c5],%[c6],%[c7]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c4],%[c5],%[c6],%[c7]], acc[46:47], v[222:223], [%[c4],%[c5],%[c6],%[c7]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[48:49], v[144:145], [%[c8],%[c9],%[c10],%[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[50:51], v[146:147], [%[c8],%[c9],%[c10],%[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[42:43], v[218:219], [%[c4], %[c5], %[c6], %[c7]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[44:45], v[220:221], [%[c4], %[c5], %[c6], %[c7]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[46:47], v[222:223], [%[c4], %[c5], %[c6], %[c7]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8], %[c9], %[c10], %[c11]], acc[48:49], v[144:145], [%[c8], %[c9], %[c10], %[c11]] \n"
" buffer_load_dwordx4 acc[176:179], %[v_os_b3], s[12:15], 0 offen \n"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[52:53], v[148:149], [%[c8],%[c9],%[c10],%[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[54:55], v[150:151], [%[c8],%[c9],%[c10],%[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[56:57], v[152:153], [%[c8],%[c9],%[c10],%[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[58:59], v[154:155], [%[c8],%[c9],%[c10],%[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8], %[c9], %[c10], %[c11]], acc[50:51], v[146:147], [%[c8], %[c9], %[c10], %[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8], %[c9], %[c10], %[c11]], acc[52:53], v[148:149], [%[c8], %[c9], %[c10], %[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8], %[c9], %[c10], %[c11]], acc[54:55], v[150:151], [%[c8], %[c9], %[c10], %[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8], %[c9], %[c10], %[c11]], acc[56:57], v[152:153], [%[c8], %[c9], %[c10], %[c11]] \n"
" buffer_load_dwordx4 acc[180:183], %[v_os_b3], s[12:15], 0 offen offset:1024 \n"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[60:61], v[156:157], [%[c8],%[c9],%[c10],%[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[62:63], v[158:159], [%[c8],%[c9],%[c10],%[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[48:49], v[208:209], [%[c12],%[c13],%[c14],%[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[50:51], v[210:211], [%[c12],%[c13],%[c14],%[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8], %[c9], %[c10], %[c11]], acc[58:59], v[154:155], [%[c8], %[c9], %[c10], %[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8], %[c9], %[c10], %[c11]], acc[60:61], v[156:157], [%[c8], %[c9], %[c10], %[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8], %[c9], %[c10], %[c11]], acc[62:63], v[158:159], [%[c8], %[c9], %[c10], %[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12], %[c13], %[c14], %[c15]], acc[48:49], v[208:209], [%[c12], %[c13], %[c14], %[c15]] \n"
" buffer_load_dwordx4 acc[184:187], %[v_os_b3], s[12:15], 0 offen offset:2048 \n"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[52:53], v[212:213], [%[c12],%[c13],%[c14],%[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[54:55], v[214:215], [%[c12],%[c13],%[c14],%[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[56:57], v[216:217], [%[c12],%[c13],%[c14],%[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[58:59], v[218:219], [%[c12],%[c13],%[c14],%[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12], %[c13], %[c14], %[c15]], acc[50:51], v[210:211], [%[c12], %[c13], %[c14], %[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12], %[c13], %[c14], %[c15]], acc[52:53], v[212:213], [%[c12], %[c13], %[c14], %[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12], %[c13], %[c14], %[c15]], acc[54:55], v[214:215], [%[c12], %[c13], %[c14], %[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12], %[c13], %[c14], %[c15]], acc[56:57], v[216:217], [%[c12], %[c13], %[c14], %[c15]] \n"
" buffer_load_dwordx4 acc[188:191], %[v_os_b3], s[12:15], 0 offen offset:3072 \n"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[60:61], v[220:221], [%[c12],%[c13],%[c14],%[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[62:63], v[222:223], [%[c12],%[c13],%[c14],%[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12], %[c13], %[c14], %[c15]], acc[58:59], v[218:219], [%[c12], %[c13], %[c14], %[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12], %[c13], %[c14], %[c15]], acc[60:61], v[220:221], [%[c12], %[c13], %[c14], %[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12], %[c13], %[c14], %[c15]], acc[62:63], v[222:223], [%[c12], %[c13], %[c14], %[c15]] \n"
" s_waitcnt vmcnt(32) \n"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[64:65], v[160:161], [%[c0], %[c1], %[c2], %[c3]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[66:67], v[162:163], [%[c0], %[c1], %[c2], %[c3]] \n"
" buffer_load_dwordx4 acc[192:195], %[v_os_b4], s[12:15], 0 offen \n"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[66:67], v[162:163], [%[c0], %[c1], %[c2], %[c3]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[68:69], v[164:165], [%[c0], %[c1], %[c2], %[c3]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[70:71], v[166:167], [%[c0], %[c1], %[c2], %[c3]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[72:73], v[168:169], [%[c0], %[c1], %[c2], %[c3]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[74:75], v[170:171], [%[c0], %[c1], %[c2], %[c3]] \n"
" buffer_load_dwordx4 acc[196:199], %[v_os_b4], s[12:15], 0 offen offset:1024 \n"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[74:75], v[170:171], [%[c0], %[c1], %[c2], %[c3]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[76:77], v[172:173], [%[c0], %[c1], %[c2], %[c3]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[78:79], v[174:175], [%[c0], %[c1], %[c2], %[c3]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[64:65], v[224:225], [%[c4], %[c5], %[c6], %[c7]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[66:67], v[226:227], [%[c4], %[c5], %[c6], %[c7]] \n"
" buffer_load_dwordx4 acc[200:203], %[v_os_b4], s[12:15], 0 offen offset:2048 \n"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[66:67], v[226:227], [%[c4], %[c5], %[c6], %[c7]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[68:69], v[228:229], [%[c4], %[c5], %[c6], %[c7]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[70:71], v[230:231], [%[c4], %[c5], %[c6], %[c7]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[72:73], v[232:233], [%[c4], %[c5], %[c6], %[c7]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[74:75], v[234:235], [%[c4], %[c5], %[c6], %[c7]] \n"
" buffer_load_dwordx4 acc[204:207], %[v_os_b4], s[12:15], 0 offen offset:3072 \n"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[74:75], v[234:235], [%[c4], %[c5], %[c6], %[c7]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[76:77], v[236:237], [%[c4], %[c5], %[c6], %[c7]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[78:79], v[238:239], [%[c4], %[c5], %[c6], %[c7]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[80:81], v[160:161], [%[c8],%[c9],%[c10],%[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[82:83], v[162:163], [%[c8],%[c9],%[c10],%[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8], %[c9], %[c10], %[c11]], acc[80:81], v[160:161], [%[c8], %[c9], %[c10], %[c11]] \n"
" buffer_load_dwordx4 acc[208:211], %[v_os_b5], s[12:15], 0 offen \n"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[84:85], v[164:165], [%[c8],%[c9],%[c10],%[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[86:87], v[166:167], [%[c8],%[c9],%[c10],%[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[88:89], v[168:169], [%[c8],%[c9],%[c10],%[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[90:91], v[170:171], [%[c8],%[c9],%[c10],%[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8], %[c9], %[c10], %[c11]], acc[82:83], v[162:163], [%[c8], %[c9], %[c10], %[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8], %[c9], %[c10], %[c11]], acc[84:85], v[164:165], [%[c8], %[c9], %[c10], %[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8], %[c9], %[c10], %[c11]], acc[86:87], v[166:167], [%[c8], %[c9], %[c10], %[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8], %[c9], %[c10], %[c11]], acc[88:89], v[168:169], [%[c8], %[c9], %[c10], %[c11]] \n"
" buffer_load_dwordx4 acc[212:215], %[v_os_b5], s[12:15], 0 offen offset:1024 \n"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[92:93], v[172:173], [%[c8],%[c9],%[c10],%[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[94:95], v[174:175], [%[c8],%[c9],%[c10],%[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[80:81], v[224:225], [%[c12],%[c13],%[c14],%[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[82:83], v[226:227], [%[c12],%[c13],%[c14],%[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8], %[c9], %[c10], %[c11]], acc[90:91], v[170:171], [%[c8], %[c9], %[c10], %[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8], %[c9], %[c10], %[c11]], acc[92:93], v[172:173], [%[c8], %[c9], %[c10], %[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8], %[c9], %[c10], %[c11]], acc[94:95], v[174:175], [%[c8], %[c9], %[c10], %[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12], %[c13], %[c14], %[c15]], acc[80:81], v[224:225], [%[c12], %[c13], %[c14], %[c15]] \n"
" buffer_load_dwordx4 acc[216:219], %[v_os_b5], s[12:15], 0 offen offset:2048 \n"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[84:85], v[228:229], [%[c12],%[c13],%[c14],%[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[86:87], v[230:231], [%[c12],%[c13],%[c14],%[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[88:89], v[232:233], [%[c12],%[c13],%[c14],%[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[90:91], v[234:235], [%[c12],%[c13],%[c14],%[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12], %[c13], %[c14], %[c15]], acc[82:83], v[226:227], [%[c12], %[c13], %[c14], %[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12], %[c13], %[c14], %[c15]], acc[84:85], v[228:229], [%[c12], %[c13], %[c14], %[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12], %[c13], %[c14], %[c15]], acc[86:87], v[230:231], [%[c12], %[c13], %[c14], %[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12], %[c13], %[c14], %[c15]], acc[88:89], v[232:233], [%[c12], %[c13], %[c14], %[c15]] \n"
" buffer_load_dwordx4 acc[220:223], %[v_os_b5], s[12:15], 0 offen offset:3072 \n"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[92:93], v[236:237], [%[c12],%[c13],%[c14],%[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[94:95], v[238:239], [%[c12],%[c13],%[c14],%[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12], %[c13], %[c14], %[c15]], acc[90:91], v[234:235], [%[c12], %[c13], %[c14], %[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12], %[c13], %[c14], %[c15]], acc[92:93], v[236:237], [%[c12], %[c13], %[c14], %[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12], %[c13], %[c14], %[c15]], acc[94:95], v[238:239], [%[c12], %[c13], %[c14], %[c15]] \n"
" s_waitcnt vmcnt(32) \n"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[96:97], v[176:177], [%[c0], %[c1], %[c2], %[c3]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[98:99], v[178:179], [%[c0], %[c1], %[c2], %[c3]] \n"
" buffer_load_dwordx4 acc[224:227], %[v_os_b6], s[12:15], 0 offen \n"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[98:99], v[178:179], [%[c0], %[c1], %[c2], %[c3]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[100:101], v[180:181], [%[c0], %[c1], %[c2], %[c3]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[102:103], v[182:183], [%[c0], %[c1], %[c2], %[c3]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[104:105], v[184:185], [%[c0], %[c1], %[c2], %[c3]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[106:107], v[186:187], [%[c0], %[c1], %[c2], %[c3]] \n"
" buffer_load_dwordx4 acc[228:231], %[v_os_b6], s[12:15], 0 offen offset:1024 \n"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[106:107], v[186:187], [%[c0], %[c1], %[c2], %[c3]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[108:109], v[188:189], [%[c0], %[c1], %[c2], %[c3]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[110:111], v[190:191], [%[c0], %[c1], %[c2], %[c3]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[96:97], v[240:241], [%[c4], %[c5], %[c6], %[c7]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[98:99], v[242:243], [%[c4], %[c5], %[c6], %[c7]] \n"
" buffer_load_dwordx4 acc[232:235], %[v_os_b6], s[12:15], 0 offen offset:2048 \n"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[98:99], v[242:243], [%[c4], %[c5], %[c6], %[c7]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[100:101], v[244:245], [%[c4], %[c5], %[c6], %[c7]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[102:103], v[246:247], [%[c4], %[c5], %[c6], %[c7]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[104:105], v[248:249], [%[c4], %[c5], %[c6], %[c7]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[106:107], v[250:251], [%[c4], %[c5], %[c6], %[c7]] \n"
" buffer_load_dwordx4 acc[236:239], %[v_os_b6], s[12:15], 0 offen offset:3072 \n"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[106:107], v[250:251], [%[c4], %[c5], %[c6], %[c7]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[108:109], v[252:253], [%[c4], %[c5], %[c6], %[c7]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[110:111], v[254:255], [%[c4], %[c5], %[c6], %[c7]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[112:113], v[176:177], [%[c8],%[c9],%[c10],%[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[114:115], v[178:179], [%[c8],%[c9],%[c10],%[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8], %[c9], %[c10], %[c11]], acc[112:113], v[176:177], [%[c8], %[c9], %[c10], %[c11]] \n"
" buffer_load_dwordx4 acc[240:243], %[v_os_b7], s[12:15], 0 offen \n"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[116:117], v[180:181], [%[c8],%[c9],%[c10],%[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[118:119], v[182:183], [%[c8],%[c9],%[c10],%[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[120:121], v[184:185], [%[c8],%[c9],%[c10],%[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[122:123], v[186:187], [%[c8],%[c9],%[c10],%[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8], %[c9], %[c10], %[c11]], acc[114:115], v[178:179], [%[c8], %[c9], %[c10], %[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8], %[c9], %[c10], %[c11]], acc[116:117], v[180:181], [%[c8], %[c9], %[c10], %[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8], %[c9], %[c10], %[c11]], acc[118:119], v[182:183], [%[c8], %[c9], %[c10], %[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8], %[c9], %[c10], %[c11]], acc[120:121], v[184:185], [%[c8], %[c9], %[c10], %[c11]] \n"
" buffer_load_dwordx4 acc[244:247], %[v_os_b7], s[12:15], 0 offen offset:1024 \n"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[124:125], v[188:189], [%[c8],%[c9],%[c10],%[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[126:127], v[190:191], [%[c8],%[c9],%[c10],%[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[112:113], v[240:241], [%[c12],%[c13],%[c14],%[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[114:115], v[242:243], [%[c12],%[c13],%[c14],%[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8], %[c9], %[c10], %[c11]], acc[122:123], v[186:187], [%[c8], %[c9], %[c10], %[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8], %[c9], %[c10], %[c11]], acc[124:125], v[188:189], [%[c8], %[c9], %[c10], %[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c8], %[c9], %[c10], %[c11]], acc[126:127], v[190:191], [%[c8], %[c9], %[c10], %[c11]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12], %[c13], %[c14], %[c15]], acc[112:113], v[240:241], [%[c12], %[c13], %[c14], %[c15]] \n"
" buffer_load_dwordx4 acc[248:251], %[v_os_b7], s[12:15], 0 offen offset:2048 \n"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[116:117], v[244:245], [%[c12],%[c13],%[c14],%[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[118:119], v[246:247], [%[c12],%[c13],%[c14],%[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[120:121], v[248:249], [%[c12],%[c13],%[c14],%[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[122:123], v[250:251], [%[c12],%[c13],%[c14],%[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12], %[c13], %[c14], %[c15]], acc[114:115], v[242:243], [%[c12], %[c13], %[c14], %[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12], %[c13], %[c14], %[c15]], acc[116:117], v[244:245], [%[c12], %[c13], %[c14], %[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12], %[c13], %[c14], %[c15]], acc[118:119], v[246:247], [%[c12], %[c13], %[c14], %[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12], %[c13], %[c14], %[c15]], acc[120:121], v[248:249], [%[c12], %[c13], %[c14], %[c15]] \n"
" buffer_load_dwordx4 acc[252:255], %[v_os_b7], s[12:15], 0 offen offset:3072 \n"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[124:125], v[252:253], [%[c12],%[c13],%[c14],%[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[126:127], v[254:255], [%[c12],%[c13],%[c14],%[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12], %[c13], %[c14], %[c15]], acc[122:123], v[250:251], [%[c12], %[c13], %[c14], %[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12], %[c13], %[c14], %[c15]], acc[124:125], v[252:253], [%[c12], %[c13], %[c14], %[c15]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c12], %[c13], %[c14], %[c15]], acc[126:127], v[254:255], [%[c12], %[c13], %[c14], %[c15]]\n"
// " s_add_u32 s60, 0x00000100, s80 \n"
// " s_cmp_lt_u32 s60, s81 \n"
// " s_cselect_b32 s56, s56, 0 \n"
......@@ -561,167 +561,168 @@ struct FlatmmSnUK_GFX9_32x128x512_1x4x1_16x16x16_BF16
" s_waitcnt vmcnt(32) \n"
" s_barrier \n"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[128:129], v[128:129], 0 \n"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[130:131], v[130:131], [%[c16],%[c17],%[c18],%[c19]] \n"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen \n"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[130:131], v[130:131], [%[c16],%[c17],%[c18],%[c19]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[132:133], v[132:133], [%[c16],%[c17],%[c18],%[c19]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[134:135], v[134:135], [%[c16],%[c17],%[c18],%[c19]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[136:137], v[136:137], [%[c16],%[c17],%[c18],%[c19]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[138:139], v[138:139], [%[c16],%[c17],%[c18],%[c19]] \n"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024 \n"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[138:139], v[138:139], [%[c16],%[c17],%[c18],%[c19]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[140:141], v[140:141], [%[c16],%[c17],%[c18],%[c19]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[142:143], v[142:143], [%[c16],%[c17],%[c18],%[c19]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[128:129], v[192:193], 0 \n"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[130:131], v[194:195], [%[c20],%[c21],%[c22],%[c23]] \n"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048 \n"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[130:131], v[194:195], [%[c20],%[c21],%[c22],%[c23]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[132:133], v[196:197], [%[c20],%[c21],%[c22],%[c23]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[134:135], v[198:199], [%[c20],%[c21],%[c22],%[c23]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[136:137], v[200:201], [%[c20],%[c21],%[c22],%[c23]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[138:139], v[202:203], [%[c20],%[c21],%[c22],%[c23]] \n"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072 \n"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[138:139], v[202:203], [%[c20],%[c21],%[c22],%[c23]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[140:141], v[204:205], [%[c20],%[c21],%[c22],%[c23]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[142:143], v[206:207], [%[c20],%[c21],%[c22],%[c23]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[144:145], v[128:129], 0 \n"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[146:147], v[130:131], [%[c24],%[c25],%[c26],%[c27]] \n"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen \n"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[146:147], v[130:131], [%[c24],%[c25],%[c26],%[c27]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[148:149], v[132:133], [%[c24],%[c25],%[c26],%[c27]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[150:151], v[134:135], [%[c24],%[c25],%[c26],%[c27]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[152:153], v[136:137], [%[c24],%[c25],%[c26],%[c27]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[154:155], v[138:139], [%[c24],%[c25],%[c26],%[c27]] \n"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024 \n"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[154:155], v[138:139], [%[c24],%[c25],%[c26],%[c27]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[156:157], v[140:141], [%[c24],%[c25],%[c26],%[c27]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[158:159], v[142:143], [%[c24],%[c25],%[c26],%[c27]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[144:145], v[192:193], 0 \n"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[146:147], v[194:195], [%[c28],%[c29],%[c30],%[c31]] \n"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048 \n"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[146:147], v[194:195], [%[c28],%[c29],%[c30],%[c31]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[148:149], v[196:197], [%[c28],%[c29],%[c30],%[c31]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[150:151], v[198:199], [%[c28],%[c29],%[c30],%[c31]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[152:153], v[200:201], [%[c28],%[c29],%[c30],%[c31]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[154:155], v[202:203], [%[c28],%[c29],%[c30],%[c31]] \n"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072 \n"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[154:155], v[202:203], [%[c28],%[c29],%[c30],%[c31]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[156:157], v[204:205], [%[c28],%[c29],%[c30],%[c31]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[158:159], v[206:207], [%[c28],%[c29],%[c30],%[c31]] \n"
" s_waitcnt vmcnt(32) \n"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[160:161], v[144:145], [%[c16],%[c17],%[c18],%[c19]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[162:163], v[146:147], [%[c16],%[c17],%[c18],%[c19]] \n"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen \n"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[162:163], v[146:147], [%[c16],%[c17],%[c18],%[c19]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[164:165], v[148:149], [%[c16],%[c17],%[c18],%[c19]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[166:167], v[150:151], [%[c16],%[c17],%[c18],%[c19]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[168:169], v[152:153], [%[c16],%[c17],%[c18],%[c19]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[170:171], v[154:155], [%[c16],%[c17],%[c18],%[c19]] \n"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024 \n"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[170:171], v[154:155], [%[c16],%[c17],%[c18],%[c19]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[172:173], v[156:157], [%[c16],%[c17],%[c18],%[c19]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[174:175], v[158:159], [%[c16],%[c17],%[c18],%[c19]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[160:161], v[208:209], [%[c20],%[c21],%[c22],%[c23]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[162:163], v[210:211], [%[c20],%[c21],%[c22],%[c23]] \n"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048 \n"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[162:163], v[210:211], [%[c20],%[c21],%[c22],%[c23]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[164:165], v[212:213], [%[c20],%[c21],%[c22],%[c23]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[166:167], v[214:215], [%[c20],%[c21],%[c22],%[c23]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[168:169], v[216:217], [%[c20],%[c21],%[c22],%[c23]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[170:171], v[218:219], [%[c20],%[c21],%[c22],%[c23]] \n"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072 \n"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[170:171], v[218:219], [%[c20],%[c21],%[c22],%[c23]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[172:173], v[220:221], [%[c20],%[c21],%[c22],%[c23]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[174:175], v[222:223], [%[c20],%[c21],%[c22],%[c23]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[176:177], v[144:145], [%[c24],%[c25],%[c26],%[c27]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[178:179], v[146:147], [%[c24],%[c25],%[c26],%[c27]] \n"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen \n"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[178:179], v[146:147], [%[c24],%[c25],%[c26],%[c27]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[180:181], v[148:149], [%[c24],%[c25],%[c26],%[c27]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[182:183], v[150:151], [%[c24],%[c25],%[c26],%[c27]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[184:185], v[152:153], [%[c24],%[c25],%[c26],%[c27]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[186:187], v[154:155], [%[c24],%[c25],%[c26],%[c27]] \n"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024 \n"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[186:187], v[154:155], [%[c24],%[c25],%[c26],%[c27]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[188:189], v[156:157], [%[c24],%[c25],%[c26],%[c27]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[190:191], v[158:159], [%[c24],%[c25],%[c26],%[c27]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[176:177], v[208:209], [%[c28],%[c29],%[c30],%[c31]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[178:179], v[210:211], [%[c28],%[c29],%[c30],%[c31]] \n"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048 \n"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[178:179], v[210:211], [%[c28],%[c29],%[c30],%[c31]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[180:181], v[212:213], [%[c28],%[c29],%[c30],%[c31]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[182:183], v[214:215], [%[c28],%[c29],%[c30],%[c31]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[184:185], v[216:217], [%[c28],%[c29],%[c30],%[c31]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[186:187], v[218:219], [%[c28],%[c29],%[c30],%[c31]] \n"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072 \n"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[186:187], v[218:219], [%[c28],%[c29],%[c30],%[c31]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[188:189], v[220:221], [%[c28],%[c29],%[c30],%[c31]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[190:191], v[222:223], [%[c28],%[c29],%[c30],%[c31]] \n"
" s_waitcnt vmcnt(32) \n"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[192:193], v[160:161], [%[c16],%[c17],%[c18],%[c19]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[194:195], v[162:163], [%[c16],%[c17],%[c18],%[c19]] \n"
" buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[12:15], 0 offen \n"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[194:195], v[162:163], [%[c16],%[c17],%[c18],%[c19]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[196:197], v[164:165], [%[c16],%[c17],%[c18],%[c19]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[198:199], v[166:167], [%[c16],%[c17],%[c18],%[c19]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[200:201], v[168:169], [%[c16],%[c17],%[c18],%[c19]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[202:203], v[170:171], [%[c16],%[c17],%[c18],%[c19]] \n"
" buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[12:15], 0 offen offset:1024 \n"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[202:203], v[170:171], [%[c16],%[c17],%[c18],%[c19]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[204:205], v[172:173], [%[c16],%[c17],%[c18],%[c19]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[206:207], v[174:175], [%[c16],%[c17],%[c18],%[c19]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[192:193], v[224:225], [%[c20],%[c21],%[c22],%[c23]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[194:195], v[226:227], [%[c20],%[c21],%[c22],%[c23]] \n"
" buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[12:15], 0 offen offset:2048 \n"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[194:195], v[226:227], [%[c20],%[c21],%[c22],%[c23]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[196:197], v[228:229], [%[c20],%[c21],%[c22],%[c23]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[198:199], v[230:231], [%[c20],%[c21],%[c22],%[c23]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[200:201], v[232:233], [%[c20],%[c21],%[c22],%[c23]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[202:203], v[234:235], [%[c20],%[c21],%[c22],%[c23]] \n"
" buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[12:15], 0 offen offset:3072 \n"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[202:203], v[234:235], [%[c20],%[c21],%[c22],%[c23]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[204:205], v[236:237], [%[c20],%[c21],%[c22],%[c23]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[206:207], v[238:239], [%[c20],%[c21],%[c22],%[c23]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[208:209], v[160:161], [%[c24],%[c25],%[c26],%[c27]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[210:211], v[162:163], [%[c24],%[c25],%[c26],%[c27]] \n"
" buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[12:15], 0 offen \n"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[210:211], v[162:163], [%[c24],%[c25],%[c26],%[c27]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[212:213], v[164:165], [%[c24],%[c25],%[c26],%[c27]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[214:215], v[166:167], [%[c24],%[c25],%[c26],%[c27]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[216:217], v[168:169], [%[c24],%[c25],%[c26],%[c27]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[218:219], v[170:171], [%[c24],%[c25],%[c26],%[c27]] \n"
" buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[12:15], 0 offen offset:1024 \n"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[218:219], v[170:171], [%[c24],%[c25],%[c26],%[c27]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[220:221], v[172:173], [%[c24],%[c25],%[c26],%[c27]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[222:223], v[174:175], [%[c24],%[c25],%[c26],%[c27]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[208:209], v[224:225], [%[c28],%[c29],%[c30],%[c31]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[210:211], v[226:227], [%[c28],%[c29],%[c30],%[c31]] \n"
" buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[12:15], 0 offen offset:2048 \n"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[210:211], v[226:227], [%[c28],%[c29],%[c30],%[c31]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[212:213], v[228:229], [%[c28],%[c29],%[c30],%[c31]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[214:215], v[230:231], [%[c28],%[c29],%[c30],%[c31]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[216:217], v[232:233], [%[c28],%[c29],%[c30],%[c31]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[218:219], v[234:235], [%[c28],%[c29],%[c30],%[c31]] \n"
" buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[12:15], 0 offen offset:3072 \n"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[218:219], v[234:235], [%[c28],%[c29],%[c30],%[c31]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[220:221], v[236:237], [%[c28],%[c29],%[c30],%[c31]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[222:223], v[238:239], [%[c28],%[c29],%[c30],%[c31]] \n"
" s_waitcnt vmcnt(32) \n"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[224:225], v[176:177], [%[c16],%[c17],%[c18],%[c19]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[226:227], v[178:179], [%[c16],%[c17],%[c18],%[c19]] \n"
" buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[12:15], 0 offen \n"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[226:227], v[178:179], [%[c16],%[c17],%[c18],%[c19]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[228:229], v[180:181], [%[c16],%[c17],%[c18],%[c19]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[230:231], v[182:183], [%[c16],%[c17],%[c18],%[c19]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[232:233], v[184:185], [%[c16],%[c17],%[c18],%[c19]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[234:235], v[186:187], [%[c16],%[c17],%[c18],%[c19]] \n"
" buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[12:15], 0 offen offset:1024 \n"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[234:235], v[186:187], [%[c16],%[c17],%[c18],%[c19]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[236:237], v[188:189], [%[c16],%[c17],%[c18],%[c19]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[238:239], v[190:191], [%[c16],%[c17],%[c18],%[c19]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[224:225], v[240:241], [%[c20],%[c21],%[c22],%[c23]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[226:227], v[242:243], [%[c20],%[c21],%[c22],%[c23]] \n"
" buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[12:15], 0 offen offset:2048 \n"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[226:227], v[242:243], [%[c20],%[c21],%[c22],%[c23]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[228:229], v[244:245], [%[c20],%[c21],%[c22],%[c23]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[230:231], v[246:247], [%[c20],%[c21],%[c22],%[c23]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[232:233], v[248:249], [%[c20],%[c21],%[c22],%[c23]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[234:235], v[250:251], [%[c20],%[c21],%[c22],%[c23]] \n"
" buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[12:15], 0 offen offset:3072 \n"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[234:235], v[250:251], [%[c20],%[c21],%[c22],%[c23]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[236:237], v[252:253], [%[c20],%[c21],%[c22],%[c23]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[238:239], v[254:255], [%[c20],%[c21],%[c22],%[c23]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[240:241], v[176:177], [%[c24],%[c25],%[c26],%[c27]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[242:243], v[178:179], [%[c24],%[c25],%[c26],%[c27]] \n"
" buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[12:15], 0 offen \n"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[242:243], v[178:179], [%[c24],%[c25],%[c26],%[c27]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[244:245], v[180:181], [%[c24],%[c25],%[c26],%[c27]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[246:247], v[182:183], [%[c24],%[c25],%[c26],%[c27]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[248:249], v[184:185], [%[c24],%[c25],%[c26],%[c27]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[250:251], v[186:187], [%[c24],%[c25],%[c26],%[c27]] \n"
" buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[12:15], 0 offen offset:1024 \n"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[250:251], v[186:187], [%[c24],%[c25],%[c26],%[c27]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[252:253], v[188:189], [%[c24],%[c25],%[c26],%[c27]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[254:255], v[190:191], [%[c24],%[c25],%[c26],%[c27]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[240:241], v[240:241], [%[c28],%[c29],%[c30],%[c31]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[242:243], v[242:243], [%[c28],%[c29],%[c30],%[c31]] \n"
" buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[12:15], 0 offen offset:2048 \n"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[242:243], v[242:243], [%[c28],%[c29],%[c30],%[c31]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[244:245], v[244:245], [%[c28],%[c29],%[c30],%[c31]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[246:247], v[246:247], [%[c28],%[c29],%[c30],%[c31]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[248:249], v[248:249], [%[c28],%[c29],%[c30],%[c31]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[250:251], v[250:251], [%[c28],%[c29],%[c30],%[c31]] \n"
" buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[12:15], 0 offen offset:3072 \n"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[250:251], v[250:251], [%[c28],%[c29],%[c30],%[c31]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[252:253], v[252:253], [%[c28],%[c29],%[c30],%[c31]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[254:255], v[254:255], [%[c28],%[c29],%[c30],%[c31]] \n"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[254:255], v[254:255], [%[c28],%[c29],%[c30],%[c31]]\n"
// " s_add_u32 s60, 0x00000100, s80 \n"
// " s_cmp_lt_u32 s60, s81 \n"
// " s_cselect_b32 s56, s56, 0 \n"
......
......@@ -13,6 +13,26 @@ namespace ck_tile {
// require 4 wave, occupancy=1c
// agpr useage:256
// vgpr usage:64(A local) + 64(acc) + 8(os_a) + 8(os_b) = 144 (rem:112)
//
// for this gemm, 4 16x16x16 transposed layout
// input A vpgpr layout
// v0-v15: [ 0:15](gemm_m)x128(gemm_k)
// v16-v31: [16:31](gemm_m)x128(gemm_k)
// input B vpgpr layout
// v0-v15: [ 0: 15](gemm_n)x128(gemm_k)
// v16-v31: [ 64: 79](gemm_n)x128(gemm_k)
// ......................
// v111-v127: [448:463](gemm_n)x128(gemm_k)
// output C vpgpr layout
// v0-v3 : [ 0:15](gemm_m)x[ 0: 15](gemm_n)
// v4-v7 : [16:31](gemm_m)x[ 0: 15](gemm_n)
// v8-v11: [ 0:15](gemm_m)x[64: 79](gemm_n)
// v12-v15: [16:31](gemm_m)x[64: 79](gemm_n)
// ......................
// v56-v59: [ 0:15](gemm_m)x[448:463](gemm_n)
// v60-v63: [16:31](gemm_m)x[448:463](gemm_n)
struct FlatmmUK_GFX9_32x512x128_1x4x1_16x16x16_BF16
{
static constexpr index_t Block_M = 32;
......@@ -42,7 +62,7 @@ struct FlatmmUK_GFX9_32x512x128_1x4x1_16x16x16_BF16
static constexpr index_t Repeat_N = Block_N / (Warp_N * WarpPerBlock_N); // 8
static constexpr index_t Repeat_K = Block_K / (Warp_K * WarpPerBlock_K); // 8/2=4
static CK_TILE_DEVICE constexpr auto MakeCBlockTile()
static CK_TILE_DEVICE constexpr auto MakeCBlockDist()
{
constexpr auto c_block_outer_dstr_encoding = tile_distribution_encoding<
sequence<>,
......@@ -53,11 +73,17 @@ struct FlatmmUK_GFX9_32x512x128_1x4x1_16x16x16_BF16
sequence<0, 0>>{};
using WG = WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution;
using CDataType = float;
constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
c_block_outer_dstr_encoding, typename WG::CWarpDstrEncoding{});
constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode);
return c_block_dstr;
}
static CK_TILE_DEVICE constexpr auto MakeCBlockTile()
{
using CDataType = float;
constexpr auto c_block_dstr = MakeCBlockDist();
auto c_block_tensor = make_static_distributed_tensor<CDataType>(c_block_dstr);
return c_block_tensor;
}
......@@ -153,21 +179,8 @@ struct FlatmmUK_GFX9_32x512x128_1x4x1_16x16x16_BF16
// template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto MakeLdsLoadDesc_A()
{
// A async->LDS
// Note that, this descriptor is only to construct the layout inside LDS
// in real Gemm pipeline, ds_read may not follow this pattern
// (may follow that in tile_distribution)
// below code is almost the same as SmemStore dist, with difference:
// 1). modify the GuaranteedLastDimensionVectorLength of naive tensor desc
// 2). return discriptor is in NxK 2d layout
// constexpr index_t Block_M = Problem::BlockShape::Block_M0;
// constexpr index_t Block_K = Problem::BlockShape::Block_K0;
// constexpr index_t BlockSize = Problem::BlockShape::BlockSize;
constexpr index_t warpSize = ck_tile::get_warp_size();
// constexpr index_t NumWarps = Problem::BlockShape::NumWarps;
// load from LDS to register, every wave has same layout
constexpr index_t KPack_ = 8; // GetSmemKPack_A<Problem>(); // LDS
constexpr index_t KVector = 2; // GetAlignment_A<Problem>(); // async copy 1 dword
constexpr index_t KPad = KPack_; // pad between warps
constexpr index_t kAMLane = 16;
......@@ -176,29 +189,12 @@ struct FlatmmUK_GFX9_32x512x128_1x4x1_16x16x16_BF16
constexpr index_t kKIter = 2;
static_assert(KPack_ == (kABKPerLane * kKIter));
static_assert(Block_K % KVector == 0);
constexpr index_t LanesPerK = Block_K / KVector; // how many thread loading K
if constexpr(LanesPerK >= warpSize)
{
// need multiple waves to load K
static_assert(LanesPerK % warpSize == 0);
constexpr index_t wavesPerK = LanesPerK / warpSize;
if constexpr(wavesPerK >= NumWarps)
{
// TODO: need multiple issues along K to load all data
}
else
{
// TODO: every wave load the same data!
static_assert(Block_K % (kABKLane * KPack_) == 0);
constexpr index_t issue_along_k = Block_K / (kABKLane * KPack_); // 4
constexpr index_t issue_along_m = Block_M / (kAMLane); // 2
constexpr auto lds_block_desc_0 = make_naive_tensor_descriptor(
make_tuple(number<issue_along_m>{}, // m0
number<kAMLane>{}, // m1
number<issue_along_k>{}, // k0
number<kABKLane>{}, // k1
number<KPack_>{}), // k2
constexpr auto lds_block_desc_0 = make_naive_tensor_descriptor(
make_tuple(number<Repeat_M>{}, // m0 y
number<kAMLane>{}, // m1 p
number<Repeat_K>{}, // k0 y
number<kABKLane>{}, // k1 p
number<KPack_>{}), // k2 y-vector
make_tuple(number<kAMLane*(Block_K + KPad)>{}, // m0
number<Block_K + KPad>{}, // m1
number<kABKLane * KPack_>{}, // k0
......@@ -207,21 +203,16 @@ struct FlatmmUK_GFX9_32x512x128_1x4x1_16x16x16_BF16
number<KPack_>{}, // lds load vector
number<1>{});
constexpr auto lds_desc_m_k = transform_tensor_descriptor(
lds_block_desc_0,
make_tuple(make_merge_transform(
make_tuple(number<issue_along_m>{}, number<kAMLane>{})),
make_merge_transform(make_tuple(
number<issue_along_k>{}, number<kABKLane>{}, number<KPack_>{}))),
make_tuple(sequence<0, 1>{}, sequence<2, 3, 4>{}),
make_tuple(sequence<0>{}, sequence<1>{}));
return lds_desc_m_k;
}
}
else
{
}
constexpr auto lds_desc_m_k = transform_tensor_descriptor(
lds_block_desc_0,
make_tuple(make_merge_transform(
make_tuple(number<Repeat_M>{}, number<kAMLane>{})),
make_merge_transform(make_tuple(
number<Repeat_K>{}, number<kABKLane>{}, number<KPack_>{}))),
make_tuple(sequence<0, 1>{}, sequence<2, 3, 4>{}),
make_tuple(sequence<0>{}, sequence<1>{}));
return lds_desc_m_k;
}
static constexpr auto GetGemm_AWarpEnc()
......@@ -271,10 +262,10 @@ struct FlatmmUK_GFX9_32x512x128_1x4x1_16x16x16_BF16
auto a_sld = [&]() {
constexpr auto a_warp_enc_ = GetGemm_AWarpEnc();
constexpr auto a_outer_dstr_enc = tile_distribution_encoding<
sequence<>,
sequence<WarpPerBlock_N>,
tuple<sequence<Repeat_M, WarpPerBlock_M>, sequence<Repeat_K>>,
tuple<sequence<1>>,
tuple<sequence<1>>,
tuple<sequence<1, 0>>,
tuple<sequence<1, 0>>,
sequence<1, 2>,
sequence<0, 0>>{};
constexpr auto a_block_dstr_encode =
......@@ -300,6 +291,12 @@ struct FlatmmUK_GFX9_32x512x128_1x4x1_16x16x16_BF16
},
number<a_sld.get_num_of_access()>{});
printf("----- tid:%d, a_sld:%d\n", static_cast<index_t>(threadIdx.x),
static_cast<index_t>(a_sld.cached_coords_[number<0>{}].get_offset()));
index_t loop_cnt = k / Block_K;
// this is the acc thread buffer
......
......@@ -586,6 +586,47 @@ struct FusedMoeGemmPipelineFlatmmPolicy
return desc;
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto MakeBridgeLdsStoreForUKDesc()
{
constexpr index_t WarpPerBlock_N = Problem::BlockShape::WarpPerBlock_N0;
constexpr index_t Repeat_N = Problem::BlockShape::Repeat_N0;
constexpr index_t Repeat_M = Problem::BlockShape::Repeat_M0;
constexpr index_t kAMLane = 16;
constexpr index_t kABKLane = 4;
constexpr index_t kABKPerLane = 4;
constexpr index_t KPack = kABKPerLane;
constexpr auto lds_block_desc_0 = make_naive_tensor_descriptor(
make_tuple(number<Repeat_M>{}, // m
number<Repeat_N>{}, // n
number<WarpPerBlock_N>{}, // n
number<kABKLane>{}, // n
number<kAMLane>{}, // m
number<KPack>{}), // n
make_tuple(number<Repeat_N * WarpPerBlock_N * kABKLane * kAMLane * KPack>{}, // m
number<WarpPerBlock_N * kABKLane * kAMLane * KPack>{}, // n
number<kABKLane * kAMLane * KPack>{}, // n
number<kAMLane * KPack>{}, // n
number<KPack>{}, // m
number<1>{}), // n
number<KPack>{}, // lds store vector(actually no explicit store)
number<1>{});
constexpr auto desc = transform_tensor_descriptor(
lds_block_desc_0,
make_tuple(
make_merge_transform(make_tuple(number<Repeat_M>{}, number<kAMLane>{})),
make_merge_transform(make_tuple(number<Repeat_N>{}, number<WarpPerBlock_N>{}, number<kABKLane>{}, number<KPack>{}))
),
make_tuple(sequence<0, 4>{}, sequence<1, 2, 3, 5>{}),
make_tuple(sequence<0>{}, sequence<1>{}));
return desc;
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto GetWarpGemm0()
{
......
......@@ -340,12 +340,15 @@ struct FusedMoeGemmPipeline_FlatmmUk
number<row_ids_a.size()>{});
auto bridge_sst_win = [&]() {
return make_tile_window(
constexpr auto desc_ = Policy::template MakeBridgeLdsStoreForUKDesc<Problem>();
constexpr auto dist_ = Policy::template GetUK_0<Problem>().MakeCBlockDist();
return make_tile_window_linear(
make_tensor_view<address_space_enum::lds>(
reinterpret_cast<YDataType*>(smem),
Policy::template MakeBridgeLdsStoreDesc<Problem>()),
Policy::template MakeBridgeLdsStoreDesc<Problem>().get_lengths(),
{0, 0});
desc_),
desc_.get_lengths(),
{0, 0},
dist_);
}();
auto o_res =
make_wave_buffer_resource(reinterpret_cast<const ODataType*>(kargs.o_ptr),
......@@ -439,8 +442,56 @@ struct FusedMoeGemmPipeline_FlatmmUk
BlockShape::Block_W0); // tile offset for B matrix each unroll
// return ;
//sweep_tile(acc_0,
// [&](auto idx) { typename Problem::GateActivation{}(acc_0(idx), acc_0[idx]); });
sweep_tile(acc_0,
[&](auto idx) { typename Problem::GateActivation{}(acc_0(idx), acc_0[idx]); });
[&](auto idx0, auto idx1) {
fp32x2_t v_ {acc_0(idx0), acc_0(idx1)};
typename Problem::GateActivation{}(v_, v_);
acc_0(idx0) = v_.x;
acc_0(idx1) = v_.y;
},
sequence<1, 2>{});
#if 0
printf("bid:%d,%d, tid:%d, sorted_tile_id:%d(, intermediate_tile_id:%d, e:%d, "
"interm_idx_nr:%d, coords:a:%d,%d,%d, row_ids_a:%d,%d,%d, (%d)g_coords:%d.%d.%d, bridge_sst_win:%d"
"acc:%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f\n",
static_cast<int>(blockIdx.x),
static_cast<int>(blockIdx.y),
static_cast<int>(threadIdx.x),
sorted_tile_id,
intermediate_tile_id,
expert_id,
interm_idx_nr,
row_coords_a[0],
row_coords_a[1],
row_coords_a[7],
row_ids_a[0],
row_ids_a[1],
row_ids_a[7],
kr_0 * BlockShape::Block_W0,
g_coords[number<0>{}],
g_coords[number<1>{}],
g_coords[number<7>{}],
bridge_sst_win.cached_coords_[number<0>{}].get_offset(),
acc_0.get_thread_buffer()[number<0>{}],
acc_0.get_thread_buffer()[number<1>{}],
acc_0.get_thread_buffer()[number<2>{}],
acc_0.get_thread_buffer()[number<3>{}],
acc_0.get_thread_buffer()[number<4>{}],
acc_0.get_thread_buffer()[number<5>{}],
acc_0.get_thread_buffer()[number<6>{}],
acc_0.get_thread_buffer()[number<7>{}],
acc_0.get_thread_buffer()[number<8 + 0>{}],
acc_0.get_thread_buffer()[number<8 + 1>{}],
acc_0.get_thread_buffer()[number<8 + 2>{}],
acc_0.get_thread_buffer()[number<8 + 3>{}],
acc_0.get_thread_buffer()[number<8 + 4>{}],
acc_0.get_thread_buffer()[number<8 + 5>{}],
acc_0.get_thread_buffer()[number<8 + 6>{}],
acc_0.get_thread_buffer()[number<8 + 7>{}]);
#endif
auto y_pre = cast_tile<YDataType>(acc_0);
store_tile(bridge_sst_win, y_pre);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment