Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
572865a6
Commit
572865a6
authored
Nov 14, 2024
by
carlushuang
Browse files
update first gemm ok
parent
9ec4e3f7
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
371 additions
and
214 deletions
+371
-214
example/ck_tile/15_fused_moe/main.cpp
example/ck_tile/15_fused_moe/main.cpp
+8
-5
include/ck_tile/core/tensor/tile_window_linear.hpp
include/ck_tile/core/tensor/tile_window_linear.hpp
+28
-17
include/ck_tile/host/reference/reference_fused_moe.hpp
include/ck_tile/host/reference/reference_fused_moe.hpp
+4
-1
include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp
.../ck_tile/ops/elementwise/unary_element_wise_operation.hpp
+53
-3
include/ck_tile/ops/flatmm/pipeline/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.hpp
...peline/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.hpp
+127
-126
include/ck_tile/ops/flatmm/pipeline/uk/flatmm_uk_gfx9_32x512x128_1x4x1_16x16x16.hpp
.../pipeline/uk/flatmm_uk_gfx9_32x512x128_1x4x1_16x16x16.hpp
+54
-57
include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp
...sed_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp
+41
-0
include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp
...s/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp
+56
-5
No files found.
example/ck_tile/15_fused_moe/main.cpp
View file @
572865a6
...
...
@@ -207,8 +207,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
{(
max_num_tokens_padded
+
block_m
-
1
)
/
block_m
});
ck_tile
::
HostTensor
<
IndexDataType
>
num_sorted_tiles_host
({
1
});
#if
1
#if 1
#if
0
#
if 1
ck_tile::FillStepRange<ADataType>{-.5f, .5f, 0.01f}(a_host);
ck_tile::FillStepRange<GDataType>{-.5f, .5f, 0.01f}(g_host);
ck_tile::FillStepRange<DDataType, false>{.5f, -.5f, -0.01f}(d_host);
...
...
@@ -217,7 +217,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
ck_tile::FillStepRange<DScaleDataType>{0.f, 1.f, 0.01f}(sd_host);
ck_tile::FillStepRange<YSmoothScaleDataType>{0.f, 1.f, 0.01f}(sy_host);
ck_tile::FillStepRange<TopkWeightDataType>{-.5f, .5f, 0.01f}(topk_weight_host);
#else
#
else
ck_tile
::
FillUniformDistribution
<
ADataType
>
{
-
.5
f
,
.5
f
}(
a_host
);
ck_tile
::
FillUniformDistribution
<
GDataType
>
{
-
.5
f
,
.5
f
}(
g_host
);
ck_tile
::
FillUniformDistribution
<
DDataType
>
{
-
.5
f
,
.5
f
}(
d_host
);
...
...
@@ -226,7 +226,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
ck_tile
::
FillUniformDistribution
<
DScaleDataType
>
{
-
.5
f
,
.5
f
}(
sd_host
);
ck_tile
::
FillUniformDistribution
<
YSmoothScaleDataType
>
{
-
.5
f
,
.5
f
}(
sy_host
);
ck_tile
::
FillUniformDistribution
<
TopkWeightDataType
>
{
-
.5
f
,
.5
f
}(
topk_weight_host
);
#endif
#
endif
// permute weight
ck_tile
::
HostTensor
<
GDataType
>
g_perm_host
=
shuffle_moe_weight
(
g_host
,
prec_w
,
1
);
...
...
@@ -266,6 +266,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
ck_tile
::
HostTensor
<
DDataType
>
d_perm_host
=
shuffle_moe_weight
(
d_host
,
prec_w
,
1
);
std
::
cout
<<
"------- @@@ "
<<
__LINE__
<<
std
::
flush
<<
std
::
endl
;
# if 0
ck_tile
::
reference_moe_sorting
<
TopkWeightDataType
,
IndexDataType
>
(
topk_ids_host
,
topk_weight_host
,
...
...
@@ -318,8 +319,10 @@ bool run(const ck_tile::ArgParser& arg_parser)
}
return
1
;
#endif
#
endif
#endif
(
void
)
balance
;
ck_tile
::
reference_moe_sorting
<
TopkWeightDataType
,
IndexDataType
>
(
topk_ids_host
,
topk_weight_host
,
...
...
include/ck_tile/core/tensor/tile_window_linear.hpp
View file @
572865a6
...
...
@@ -432,23 +432,34 @@ struct tile_window_linear
CK_TILE_DEVICE
static
constexpr
index_t
get_bottom_linear_offset
(
number
<
i_access
>
)
{
constexpr
auto
linear_coord
=
get_bottom_linear_coordinate
(
number
<
i_access
>
{});
// since this is linear offset, we assum bottom X tensor is always linear
constexpr
index_t
linear_offset
=
[
&
]()
{
constexpr
auto
x_idx_
=
linear_coord
;
constexpr
auto
x_len_
=
TileDstr
{}.
get_lengths
();
static_assert
(
x_idx_
.
size
()
==
x_len_
.
size
());
constexpr
index_t
x_dims_
=
x_idx_
.
size
();
index_t
cu_stride_
=
1
;
index_t
cu_offset_
=
0
;
static_for
<
0
,
x_dims_
,
1
>
{}([
&
](
auto
i_
)
{
auto
r_i_
=
number
<
x_dims_
-
i_
-
1
>
{};
cu_offset_
+=
x_idx_
[
r_i_
]
*
cu_stride_
;
cu_stride_
*=
x_len_
[
r_i_
];
});
return
cu_offset_
;
}();
return
linear_offset
;
constexpr
auto
is_pure_linear_tensor
=
reduce_on_sequence
(
LinearBottomDims
{},
multiplies
{},
number
<
1
>
{});
if
constexpr
(
is_pure_linear_tensor
)
{
// this case usually is a LDS window, everything is build time know.
// we directly use BottomTensorView to compute the offset, in case there is any padding
auto
bottom_tensor_coord
=
make_tensor_coordinate
(
BottomTensorView
{}.
get_tensor_descriptor
(),
linear_coord
);
return
bottom_tensor_coord
.
get_offset
();
}
else
{
// this case usually is a global window, where last dim can be linear
// we hack here, that use the original TileDstr to compute the linear offset
// ... hoping that there is no extra padding between other dims, which make sense
// since that sould introduce runtime length
constexpr
index_t
linear_offset
=
[
&
]()
{
constexpr
auto
x_idx_
=
linear_coord
;
constexpr
auto
x_len_
=
TileDstr
{}.
get_lengths
();
static_assert
(
x_idx_
.
size
()
==
x_len_
.
size
());
constexpr
index_t
x_dims_
=
x_idx_
.
size
();
index_t
cu_stride_
=
1
;
index_t
cu_offset_
=
0
;
static_for
<
0
,
x_dims_
,
1
>
{}([
&
](
auto
i_
)
{
auto
r_i_
=
number
<
x_dims_
-
i_
-
1
>
{};
cu_offset_
+=
x_idx_
[
r_i_
]
*
cu_stride_
;
cu_stride_
*=
x_len_
[
r_i_
];
});
return
cu_offset_
;
}();
return
linear_offset
;
}
}
CK_TILE_DEVICE
constexpr
auto
get_num_of_access
()
const
{
return
traits
::
NumAccess
;
}
...
...
include/ck_tile/host/reference/reference_fused_moe.hpp
View file @
572865a6
...
...
@@ -122,6 +122,7 @@ void reference_fused_moe(
type_convert
<
AccDataType
>
(
g_host
(
i_expert
,
i_n
,
i_k
));
}
acc_0
(
0
,
i_n
)
=
acc
;
// printf("ie:%2d, it:%3d, in:%d, %f\n", i_expert, i_token, i_n, acc);
}
ck_tile
::
HostTensor
<
AccDataType
>
y
({
1
,
intermediate_size_1
});
...
...
@@ -134,6 +135,7 @@ void reference_fused_moe(
for
(
ck_tile
::
index_t
i_n
=
0
;
i_n
<
intermediate_size_1
;
i_n
++
)
{
Activation
{}(
y
(
0
,
i_n
),
acc_0
(
0
,
i_n
));
printf
(
"ie:%2d, it:%3d, in:%d, %f
\n
"
,
i_expert
,
i_token
,
i_n
,
y
(
0
,
i_n
));
}
}
else
...
...
@@ -168,7 +170,8 @@ void reference_fused_moe(
}
};
make_ParallelTensorFunctor
(
f
,
max_num_tokens_padded
)(
std
::
thread
::
hardware_concurrency
());
// make_ParallelTensorFunctor(f, max_num_tokens_padded)(std::thread::hardware_concurrency());
make_ParallelTensorFunctor
(
f
,
max_num_tokens_padded
)(
1
);
// reduce
auto
r
=
[
&
](
auto
i_token
)
{
...
...
include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp
View file @
572865a6
...
...
@@ -596,9 +596,9 @@ struct FastGeluAsm
CK_TILE_DEVICE
void
operator
()
<
float
,
float
>
(
float
&
y
,
const
float
&
x
)
const
{
// const float u = 2.f * x * (0.035677f * x * x + 0.797885f);
const
floa
t
c1
=
0xbd92220c
;
// -2.0 * 0.035677f;
const
float
c2
=
-
2.0
*
0.797885
f
;
const
floa
t
log2e_
=
0x3fb8aa3b
;
// log2e_v<float>;
const
uint32_
t
c1
=
0xbd92220c
;
// -2.0 * 0.035677f;
const
float
c2
=
-
2.0
*
0.797885
f
;
const
uint32_
t
log2e_
=
0x3fb8aa3b
;
// log2e_v<float>;
float
tmp
;
asm
volatile
(
"v_mul_f32 %[v_tmp], %[v_x], %[v_x] ; x*x
\n
"
...
...
@@ -606,13 +606,63 @@ struct FastGeluAsm
"v_mul_f32 %[v_tmp], %[v_tmp], %[v_x] ; x*(c1*x*x+c2)
\n
"
"v_mul_f32 %[v_tmp], %[v_tmp], %[s_log2e] ; log2e*x*(c1*x*x+c2)
\n
"
"v_exp_f32 %[v_tmp], %[v_tmp] ; emu = exp2(log2e*x*(c1*x*x+c2))
\n
"
"s_nop 0 ; hazard for exp
\n
"
"v_add_f32 %[v_tmp], %[v_tmp], 1.0 ; emu+1.0f
\n
"
"v_rcp_f32 %[v_tmp], %[v_tmp] ; 1/(emu+1.0f)
\n
"
"s_nop 0 ; hazard for rcp
\n
"
"v_mul_f32 %[v_y], %[v_tmp], %[v_x] ; x * 1/(emu+1f)
\n
"
:
[
v_y
]
"=v"
(
y
),
[
v_tmp
]
"+v"
(
tmp
)
:
[
v_x
]
"v"
(
x
),
[
s_c1
]
"s"
(
c1
),
[
v_c2
]
"v"
(
c2
),
[
s_log2e
]
"s"
(
log2e_
)
:
);
}
template
<
>
CK_TILE_HOST
void
operator
()
<
fp32x2_t
,
fp32x2_t
>
(
fp32x2_t
&
y
,
const
fp32x2_t
&
x
)
const
{
// const float u = -2.f * x * (0.035677f * x * x + 0.797885f);
const
float
c1
=
-
2.0
*
0.035677
f
;
const
float
c2
=
-
2.0
*
0.797885
f
;
const
float
u0
=
x
.
x
*
(
c1
*
x
.
x
*
x
.
x
+
c2
);
const
float
emu0
=
exp
(
u0
);
y
.
x
=
x
.
x
/
(
1.
f
+
emu0
);
const
float
u1
=
x
.
y
*
(
c1
*
x
.
y
*
x
.
y
+
c2
);
const
float
emu1
=
exp
(
u1
);
y
.
y
=
x
.
y
/
(
1.
f
+
emu1
);
}
// this is packed verion to remove data hazard for trans
template
<
>
CK_TILE_DEVICE
void
operator
()
<
fp32x2_t
,
fp32x2_t
>
(
fp32x2_t
&
y
,
const
fp32x2_t
&
x
)
const
{
// const float u = 2.f * x * (0.035677f * x * x + 0.797885f);
const
uint32_t
c1
=
0xbd92220c
;
// -2.0 * 0.035677f;
const
float
c2
=
-
2.0
*
0.797885
f
;
const
uint32_t
log2e_
=
0x3fb8aa3b
;
// log2e_v<float>;
float
tmp0
,
tmp1
;
float
y0
,
y1
;
asm
volatile
(
"v_mul_f32 %[v_tmp0], %[v_x0], %[v_x0] ; x*x
\n
"
"v_mul_f32 %[v_tmp1], %[v_x1], %[v_x1] ; x*x
\n
"
"v_fma_f32 %[v_tmp0], %[v_tmp0], %[s_c1], %[v_c2] ; c1*x*x+c2
\n
"
"v_fma_f32 %[v_tmp1], %[v_tmp1], %[s_c1], %[v_c2] ; c1*x*x+c2
\n
"
"v_mul_f32 %[v_tmp0], %[v_tmp0], %[v_x0] ; x*(c1*x*x+c2)
\n
"
"v_mul_f32 %[v_tmp1], %[v_tmp1], %[v_x1] ; x*(c1*x*x+c2)
\n
"
"v_mul_f32 %[v_tmp0], %[v_tmp0], %[s_log2e] ; log2e*x*(c1*x*x+c2)
\n
"
"v_mul_f32 %[v_tmp1], %[v_tmp1], %[s_log2e] ; log2e*x*(c1*x*x+c2)
\n
"
"v_exp_f32 %[v_tmp0], %[v_tmp0] ; emu = exp2(log2e*x*(c1*x*x+c2))
\n
"
"v_exp_f32 %[v_tmp1], %[v_tmp1] ; emu = exp2(log2e*x*(c1*x*x+c2))
\n
"
"v_add_f32 %[v_tmp0], %[v_tmp0], 1.0 ; emu+1.0f
\n
"
"v_add_f32 %[v_tmp1], %[v_tmp1], 1.0 ; emu+1.0f
\n
"
"v_rcp_f32 %[v_tmp0], %[v_tmp0] ; 1/(emu+1.0f)
\n
"
"v_rcp_f32 %[v_tmp1], %[v_tmp1] ; 1/(emu+1.0f)
\n
"
"v_mul_f32 %[v_y0], %[v_tmp0], %[v_x0] ; x * 1/(emu+1f)
\n
"
"v_mul_f32 %[v_y1], %[v_tmp1], %[v_x1] ; x * 1/(emu+1f)
\n
"
:
[
v_y0
]
"=v"
(
y0
),
[
v_y1
]
"=v"
(
y1
),
[
v_tmp0
]
"+v"
(
tmp0
),
[
v_tmp1
]
"+v"
(
tmp1
)
:
[
v_x0
]
"v"
(
x
.
x
),
[
v_x1
]
"v"
(
x
.
y
),
[
s_c1
]
"s"
(
c1
),
[
v_c2
]
"v"
(
c2
),
[
s_log2e
]
"s"
(
log2e_
)
:
);
y
.
x
=
y0
;
y
.
y
=
y1
;
}
};
// https://paperswithcode.com/method/gelu
...
...
include/ck_tile/ops/flatmm/pipeline/uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.hpp
View file @
572865a6
...
...
@@ -262,168 +262,168 @@ struct FlatmmSnUK_GFX9_32x128x512_1x4x1_16x16x16_BF16
" s_waitcnt vmcnt(32)
\n
"
" s_barrier
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[0:1], v[128:129], 0
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[2:3], v[130:131], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" buffer_load_dwordx4 acc[128:131], %[v_os_b0], s[12:15], 0 offen
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[2:3], v[130:131], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[4:5], v[132:133], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[6:7], v[134:135], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[8:9], v[136:137], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[10:11], v[138:139], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" buffer_load_dwordx4 acc[132:135], %[v_os_b0], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[10:11], v[138:139], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[12:13], v[140:141], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[14:15], v[142:143], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[0:1], v[192:193], 0
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[2:3], v[194:195], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" buffer_load_dwordx4 acc[136:139], %[v_os_b0], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[2:3], v[194:195], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[4:5], v[196:197], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[6:7], v[198:199], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[8:9], v[200:201], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[10:11], v[202:203], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" buffer_load_dwordx4 acc[140:143], %[v_os_b0], s[12:15], 0 offen offset:3072
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[10:11], v[202:203], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[12:13], v[204:205], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[14:15], v[206:207], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[16:17], v[128:129], 0
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[18:19], v[130:131], [%[c8],%[c9],%[c10],%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8], %[c9], %[c10], %[c11]], acc[16:17], v[128:129], 0
\n
"
" buffer_load_dwordx4 acc[144:147], %[v_os_b1], s[12:15], 0 offen
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[
20:21
], v[13
2
:13
3
], [%[c8],%[c9],%[c10],%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[2
2
:2
3
], v[13
4
:13
5
], [%[c8],%[c9],%[c10],%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[2
4
:2
5
], v[13
6
:13
7
], [%[c8],%[c9],%[c10],%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[2
6
:2
7
], v[13
8
:13
9
], [%[c8],%[c9],%[c10],%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],
%[c9],
%[c10],
%[c11]], acc[
18:19
], v[13
0
:13
1
], [%[c8],
%[c9],
%[c10],
%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],
%[c9],
%[c10],
%[c11]], acc[2
0
:2
1
], v[13
2
:13
3
], [%[c8],
%[c9],
%[c10],
%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],
%[c9],
%[c10],
%[c11]], acc[2
2
:2
3
], v[13
4
:13
5
], [%[c8],
%[c9],
%[c10],
%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],
%[c9],
%[c10],
%[c11]], acc[2
4
:2
5
], v[13
6
:13
7
], [%[c8],
%[c9],
%[c10],
%[c11]]
\n
"
" buffer_load_dwordx4 acc[148:151], %[v_os_b1], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[2
8
:2
9
], v[1
40:141
], [%[c8],%[c9],%[c10],%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[
30:31
], v[14
2
:14
3
], [%[c8],%[c9],%[c10],%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c
12
],%[c
13
],%[c1
4
],%[c1
5
]], acc[
16:17
], v[1
9
2:1
9
3],
0
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[1
8
:1
9
], v[19
4
:19
5
],
[%[c12],%[c13],%[c14],%[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],
%[c9],
%[c10],
%[c11]], acc[2
6
:2
7
], v[1
38:139
], [%[c8],
%[c9],
%[c10],
%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],
%[c9],
%[c10],
%[c11]], acc[
28:29
], v[14
0
:14
1
], [%[c8],
%[c9],
%[c10],
%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c
8
],
%[c
9
],
%[c1
0
],
%[c1
1
]], acc[
30:31
], v[1
4
2:1
4
3],
[%[c8], %[c9], %[c10], %[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],
%[c13],
%[c14],
%[c15]], acc[1
6
:1
7
], v[19
2
:19
3
],
0
\n
"
" buffer_load_dwordx4 acc[152:155], %[v_os_b1], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[
20:21
], v[19
6
:19
7
], [%[c12],%[c13],%[c14],%[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[2
2
:2
3
], v[19
8
:19
9
], [%[c12],%[c13],%[c14],%[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[2
4
:2
5
], v[
200:201
], [%[c12],%[c13],%[c14],%[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[2
6
:2
7
], v[20
2
:20
3
], [%[c12],%[c13],%[c14],%[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],
%[c13],
%[c14],
%[c15]], acc[
18:19
], v[19
4
:19
5
], [%[c12],
%[c13],
%[c14],
%[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],
%[c13],
%[c14],
%[c15]], acc[2
0
:2
1
], v[19
6
:19
7
], [%[c12],
%[c13],
%[c14],
%[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],
%[c13],
%[c14],
%[c15]], acc[2
2
:2
3
], v[
198:199
], [%[c12],
%[c13],
%[c14],
%[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],
%[c13],
%[c14],
%[c15]], acc[2
4
:2
5
], v[20
0
:20
1
], [%[c12],
%[c13],
%[c14],
%[c15]]
\n
"
" buffer_load_dwordx4 acc[156:159], %[v_os_b1], s[12:15], 0 offen offset:3072
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[28:29], v[204:205], [%[c12],%[c13],%[c14],%[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[30:31], v[206:207], [%[c12],%[c13],%[c14],%[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12], %[c13], %[c14], %[c15]], acc[26:27], v[202:203], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12], %[c13], %[c14], %[c15]], acc[28:29], v[204:205], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12], %[c13], %[c14], %[c15]], acc[30:31], v[206:207], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" s_waitcnt vmcnt(32)
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c0],%[c1],%[c2],%[c3]], acc[32:33], v[144:145], [%[c0],%[c1],%[c2],%[c3]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c0],%[c1],%[c2],%[c3]], acc[34:35], v[146:147], [%[c0],%[c1],%[c2],%[c3]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[32:33], v[144:145], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" buffer_load_dwordx4 acc[160:163], %[v_os_b2], s[12:15], 0 offen
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c0],%[c1],%[c2],%[c3]], acc[3
6
:3
7
], v[14
8
:14
9
], [%[c0],%[c1],%[c2],%[c3]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c0],%[c1],%[c2],%[c3]], acc[3
8
:3
9
], v[1
50:151
], [%[c0],%[c1],%[c2],%[c3]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c0],%[c1],%[c2],%[c3]], acc[
40:41
], v[15
2
:15
3
], [%[c0],%[c1],%[c2],%[c3]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c0],%[c1],%[c2],%[c3]], acc[4
2
:4
3
], v[15
4
:15
5
], [%[c0],%[c1],%[c2],%[c3]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c0],
%[c1],
%[c2],
%[c3]], acc[3
4
:3
5
], v[14
6
:14
7
], [%[c0],
%[c1],
%[c2],
%[c3]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c0],
%[c1],
%[c2],
%[c3]], acc[3
6
:3
7
], v[1
48:149
], [%[c0],
%[c1],
%[c2],
%[c3]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c0],
%[c1],
%[c2],
%[c3]], acc[
38:39
], v[15
0
:15
1
], [%[c0],
%[c1],
%[c2],
%[c3]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c0],
%[c1],
%[c2],
%[c3]], acc[4
0
:4
1
], v[15
2
:15
3
], [%[c0],
%[c1],
%[c2],
%[c3]]
\n
"
" buffer_load_dwordx4 acc[164:167], %[v_os_b2], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c0],%[c1],%[c2],%[c3]], acc[4
4
:4
5
], v[15
6
:15
7
], [%[c0],%[c1],%[c2],%[c3]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c0],%[c1],%[c2],%[c3]], acc[4
6
:4
7
], v[15
8
:15
9
], [%[c0],%[c1],%[c2],%[c3]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c
4
],%[c
5
],%[c
6
],%[c
7
]], acc[
32:33], v[208:20
9], [%[c
4
],%[c
5
],%[c
6
],%[c
7
]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c4],%[c5],%[c6],%[c7]], acc[3
4
:3
5
], v[2
10:211
], [%[c4],%[c5],%[c6],%[c7]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c0],
%[c1],
%[c2],
%[c3]], acc[4
2
:4
3
], v[15
4
:15
5
], [%[c0],
%[c1],
%[c2],
%[c3]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c0],
%[c1],
%[c2],
%[c3]], acc[4
4
:4
5
], v[15
6
:15
7
], [%[c0],
%[c1],
%[c2],
%[c3]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c
0
],
%[c
1
],
%[c
2
],
%[c
3
]], acc[
46:47], v[158:15
9], [%[c
0
],
%[c
1
],
%[c
2
],
%[c
3
]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c4],
%[c5],
%[c6],
%[c7]], acc[3
2
:3
3
], v[2
08:209
], [%[c4],
%[c5],
%[c6],
%[c7]]
\n
"
" buffer_load_dwordx4 acc[168:171], %[v_os_b2], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c4],%[c5],%[c6],%[c7]], acc[3
6
:3
7
], v[21
2
:21
3
], [%[c4],%[c5],%[c6],%[c7]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c4],%[c5],%[c6],%[c7]], acc[3
8
:3
9
], v[21
4
:21
5
], [%[c4],%[c5],%[c6],%[c7]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c4],%[c5],%[c6],%[c7]], acc[
40:41
], v[21
6
:21
7
], [%[c4],%[c5],%[c6],%[c7]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c4],%[c5],%[c6],%[c7]], acc[4
2
:4
3
], v[21
8
:21
9
], [%[c4],%[c5],%[c6],%[c7]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c4],
%[c5],
%[c6],
%[c7]], acc[3
4
:3
5
], v[21
0
:21
1
], [%[c4],
%[c5],
%[c6],
%[c7]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c4],
%[c5],
%[c6],
%[c7]], acc[3
6
:3
7
], v[21
2
:21
3
], [%[c4],
%[c5],
%[c6],
%[c7]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c4],
%[c5],
%[c6],
%[c7]], acc[
38:39
], v[21
4
:21
5
], [%[c4],
%[c5],
%[c6],
%[c7]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c4],
%[c5],
%[c6],
%[c7]], acc[4
0
:4
1
], v[21
6
:21
7
], [%[c4],
%[c5],
%[c6],
%[c7]]
\n
"
" buffer_load_dwordx4 acc[172:175], %[v_os_b2], s[12:15], 0 offen offset:3072
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c4],%[c5],%[c6],%[c7]], acc[4
4
:4
5
], v[2
20:2
21], [%[c4],%[c5],%[c6],%[c7]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c4],%[c5],%[c6],%[c7]], acc[4
6
:4
7
], v[22
2
:22
3
], [%[c4],%[c5],%[c6],%[c7]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c
8
],%[c
9
],%[c
10
],%[c
11
]], acc[4
8
:4
9
], v[
144:145
], [%[c
8
],%[c
9
],%[c
10
],%[c
11
]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[
50:51
], v[14
6
:14
7
], [%[c8],%[c9],%[c10],%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c4],
%[c5],
%[c6],
%[c7]], acc[4
2
:4
3
], v[2
18:
21
9
], [%[c4],
%[c5],
%[c6],
%[c7]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c4],
%[c5],
%[c6],
%[c7]], acc[4
4
:4
5
], v[22
0
:22
1
], [%[c4],
%[c5],
%[c6],
%[c7]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c
4
],
%[c
5
],
%[c
6
],
%[c
7
]], acc[4
6
:4
7
], v[
222:223
], [%[c
4
],
%[c
5
],
%[c
6
],
%[c
7
]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],
%[c9],
%[c10],
%[c11]], acc[
48:49
], v[14
4
:14
5
], [%[c8],
%[c9],
%[c10],
%[c11]]
\n
"
" buffer_load_dwordx4 acc[176:179], %[v_os_b3], s[12:15], 0 offen
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[5
2
:5
3
], v[14
8
:14
9
], [%[c8],%[c9],%[c10],%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[5
4
:5
5
], v[1
50:151
], [%[c8],%[c9],%[c10],%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[5
6
:5
7
], v[15
2
:15
3
], [%[c8],%[c9],%[c10],%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[5
8
:5
9
], v[15
4
:15
5
], [%[c8],%[c9],%[c10],%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],
%[c9],
%[c10],
%[c11]], acc[5
0
:5
1
], v[14
6
:14
7
], [%[c8],
%[c9],
%[c10],
%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],
%[c9],
%[c10],
%[c11]], acc[5
2
:5
3
], v[1
48:149
], [%[c8],
%[c9],
%[c10],
%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],
%[c9],
%[c10],
%[c11]], acc[5
4
:5
5
], v[15
0
:15
1
], [%[c8],
%[c9],
%[c10],
%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],
%[c9],
%[c10],
%[c11]], acc[5
6
:5
7
], v[15
2
:15
3
], [%[c8],
%[c9],
%[c10],
%[c11]]
\n
"
" buffer_load_dwordx4 acc[180:183], %[v_os_b3], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[
60:61
], v[15
6
:15
7
], [%[c8],%[c9],%[c10],%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[6
2
:6
3
], v[15
8
:15
9
], [%[c8],%[c9],%[c10],%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c
12
],%[c
13
],%[c1
4
],%[c1
5
]], acc[
48:49], v[208:20
9], [%[c
12
],%[c
13
],%[c1
4
],%[c1
5
]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[
50:51
], v[2
10:211
], [%[c12],%[c13],%[c14],%[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],
%[c9],
%[c10],
%[c11]], acc[
58:59
], v[15
4
:15
5
], [%[c8],
%[c9],
%[c10],
%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],
%[c9],
%[c10],
%[c11]], acc[6
0
:6
1
], v[15
6
:15
7
], [%[c8],
%[c9],
%[c10],
%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c
8
],
%[c
9
],
%[c1
0
],
%[c1
1
]], acc[
62:63], v[158:15
9], [%[c
8
],
%[c
9
],
%[c1
0
],
%[c1
1
]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],
%[c13],
%[c14],
%[c15]], acc[
48:49
], v[2
08:209
], [%[c12],
%[c13],
%[c14],
%[c15]]
\n
"
" buffer_load_dwordx4 acc[184:187], %[v_os_b3], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[5
2
:5
3
], v[21
2
:21
3
], [%[c12],%[c13],%[c14],%[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[5
4
:5
5
], v[21
4
:21
5
], [%[c12],%[c13],%[c14],%[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[5
6
:5
7
], v[21
6
:21
7
], [%[c12],%[c13],%[c14],%[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[5
8
:5
9
], v[21
8
:21
9
], [%[c12],%[c13],%[c14],%[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],
%[c13],
%[c14],
%[c15]], acc[5
0
:5
1
], v[21
0
:21
1
], [%[c12],
%[c13],
%[c14],
%[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],
%[c13],
%[c14],
%[c15]], acc[5
2
:5
3
], v[21
2
:21
3
], [%[c12],
%[c13],
%[c14],
%[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],
%[c13],
%[c14],
%[c15]], acc[5
4
:5
5
], v[21
4
:21
5
], [%[c12],
%[c13],
%[c14],
%[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],
%[c13],
%[c14],
%[c15]], acc[5
6
:5
7
], v[21
6
:21
7
], [%[c12],
%[c13],
%[c14],
%[c15]]
\n
"
" buffer_load_dwordx4 acc[188:191], %[v_os_b3], s[12:15], 0 offen offset:3072
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[60:61], v[220:221], [%[c12],%[c13],%[c14],%[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[62:63], v[222:223], [%[c12],%[c13],%[c14],%[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12], %[c13], %[c14], %[c15]], acc[58:59], v[218:219], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12], %[c13], %[c14], %[c15]], acc[60:61], v[220:221], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12], %[c13], %[c14], %[c15]], acc[62:63], v[222:223], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" s_waitcnt vmcnt(32)
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[64:65], v[160:161], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[66:67], v[162:163], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" buffer_load_dwordx4 acc[192:195], %[v_os_b4], s[12:15], 0 offen
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[66:67], v[162:163], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[68:69], v[164:165], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[70:71], v[166:167], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[72:73], v[168:169], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[74:75], v[170:171], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" buffer_load_dwordx4 acc[196:199], %[v_os_b4], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[74:75], v[170:171], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[76:77], v[172:173], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[78:79], v[174:175], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[64:65], v[224:225], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[66:67], v[226:227], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" buffer_load_dwordx4 acc[200:203], %[v_os_b4], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[66:67], v[226:227], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[68:69], v[228:229], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[70:71], v[230:231], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[72:73], v[232:233], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[74:75], v[234:235], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" buffer_load_dwordx4 acc[204:207], %[v_os_b4], s[12:15], 0 offen offset:3072
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[74:75], v[234:235], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[76:77], v[236:237], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[78:79], v[238:239], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[80:81], v[160:161], [%[c8],%[c9],%[c10],%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[82:83], v[162:163], [%[c8],%[c9],%[c10],%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8], %[c9], %[c10], %[c11]], acc[80:81], v[160:161], [%[c8], %[c9], %[c10], %[c11]]
\n
"
" buffer_load_dwordx4 acc[208:211], %[v_os_b5], s[12:15], 0 offen
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[8
4
:8
5
], v[16
4
:16
5
], [%[c8],%[c9],%[c10],%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[8
6
:8
7
], v[16
6
:16
7
], [%[c8],%[c9],%[c10],%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[8
8
:8
9
], v[16
8
:16
9
], [%[c8],%[c9],%[c10],%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[
90:91
], v[1
70:171
], [%[c8],%[c9],%[c10],%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],
%[c9],
%[c10],
%[c11]], acc[8
2
:8
3
], v[16
2
:16
3
], [%[c8],
%[c9],
%[c10],
%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],
%[c9],
%[c10],
%[c11]], acc[8
4
:8
5
], v[16
4
:16
5
], [%[c8],
%[c9],
%[c10],
%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],
%[c9],
%[c10],
%[c11]], acc[8
6
:8
7
], v[16
6
:16
7
], [%[c8],
%[c9],
%[c10],
%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],
%[c9],
%[c10],
%[c11]], acc[
88:89
], v[1
68:169
], [%[c8],
%[c9],
%[c10],
%[c11]]
\n
"
" buffer_load_dwordx4 acc[212:215], %[v_os_b5], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[9
2
:9
3
], v[17
2
:17
3
], [%[c8],%[c9],%[c10],%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[9
4
:9
5
], v[17
4
:17
5
], [%[c8],%[c9],%[c10],%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c
12
],%[c
13
],%[c1
4
],%[c1
5
]], acc[
80:81], v[224:22
5], [%[c
12
],%[c
13
],%[c1
4
],%[c1
5
]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[8
2
:8
3
], v[22
6
:22
7
], [%[c12],%[c13],%[c14],%[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],
%[c9],
%[c10],
%[c11]], acc[9
0
:9
1
], v[17
0
:17
1
], [%[c8],
%[c9],
%[c10],
%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],
%[c9],
%[c10],
%[c11]], acc[9
2
:9
3
], v[17
2
:17
3
], [%[c8],
%[c9],
%[c10],
%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c
8
],
%[c
9
],
%[c1
0
],
%[c1
1
]], acc[
94:95], v[174:17
5], [%[c
8
],
%[c
9
],
%[c1
0
],
%[c1
1
]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],
%[c13],
%[c14],
%[c15]], acc[8
0
:8
1
], v[22
4
:22
5
], [%[c12],
%[c13],
%[c14],
%[c15]]
\n
"
" buffer_load_dwordx4 acc[216:219], %[v_os_b5], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[8
4
:8
5
], v[22
8
:22
9
], [%[c12],%[c13],%[c14],%[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[8
6
:8
7
], v[2
30:231
], [%[c12],%[c13],%[c14],%[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[8
8
:8
9
], v[23
2
:23
3
], [%[c12],%[c13],%[c14],%[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[
90:91
], v[23
4
:23
5
], [%[c12],%[c13],%[c14],%[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],
%[c13],
%[c14],
%[c15]], acc[8
2
:8
3
], v[22
6
:22
7
], [%[c12],
%[c13],
%[c14],
%[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],
%[c13],
%[c14],
%[c15]], acc[8
4
:8
5
], v[2
28:229
], [%[c12],
%[c13],
%[c14],
%[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],
%[c13],
%[c14],
%[c15]], acc[8
6
:8
7
], v[23
0
:23
1
], [%[c12],
%[c13],
%[c14],
%[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],
%[c13],
%[c14],
%[c15]], acc[
88:89
], v[23
2
:23
3
], [%[c12],
%[c13],
%[c14],
%[c15]]
\n
"
" buffer_load_dwordx4 acc[220:223], %[v_os_b5], s[12:15], 0 offen offset:3072
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[92:93], v[236:237], [%[c12],%[c13],%[c14],%[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[94:95], v[238:239], [%[c12],%[c13],%[c14],%[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12], %[c13], %[c14], %[c15]], acc[90:91], v[234:235], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12], %[c13], %[c14], %[c15]], acc[92:93], v[236:237], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12], %[c13], %[c14], %[c15]], acc[94:95], v[238:239], [%[c12], %[c13], %[c14], %[c15]]
\n
"
" s_waitcnt vmcnt(32)
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[96:97], v[176:177], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[98:99], v[178:179], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" buffer_load_dwordx4 acc[224:227], %[v_os_b6], s[12:15], 0 offen
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[98:99], v[178:179], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[100:101], v[180:181], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[102:103], v[182:183], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[104:105], v[184:185], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[106:107], v[186:187], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" buffer_load_dwordx4 acc[228:231], %[v_os_b6], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[106:107], v[186:187], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[108:109], v[188:189], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c0], %[c1], %[c2], %[c3]], acc[110:111], v[190:191], [%[c0], %[c1], %[c2], %[c3]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[96:97], v[240:241], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[98:99], v[242:243], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" buffer_load_dwordx4 acc[232:235], %[v_os_b6], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[98:99], v[242:243], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[100:101], v[244:245], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[102:103], v[246:247], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[104:105], v[248:249], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[106:107], v[250:251], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" buffer_load_dwordx4 acc[236:239], %[v_os_b6], s[12:15], 0 offen offset:3072
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[106:107], v[250:251], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[108:109], v[252:253], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c4], %[c5], %[c6], %[c7]], acc[110:111], v[254:255], [%[c4], %[c5], %[c6], %[c7]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[112:113], v[176:177], [%[c8],%[c9],%[c10],%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[114:115], v[178:179], [%[c8],%[c9],%[c10],%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8], %[c9], %[c10], %[c11]], acc[112:113], v[176:177], [%[c8], %[c9], %[c10], %[c11]]
\n
"
" buffer_load_dwordx4 acc[240:243], %[v_os_b7], s[12:15], 0 offen
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[11
6
:11
7
], v[1
80:181
], [%[c8],%[c9],%[c10],%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[11
8
:11
9
], v[18
2
:18
3
], [%[c8],%[c9],%[c10],%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[1
20:121
], v[18
4
:18
5
], [%[c8],%[c9],%[c10],%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[12
2
:12
3
], v[18
6
:18
7
], [%[c8],%[c9],%[c10],%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],
%[c9],
%[c10],
%[c11]], acc[11
4
:11
5
], v[1
78:179
], [%[c8],
%[c9],
%[c10],
%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],
%[c9],
%[c10],
%[c11]], acc[11
6
:11
7
], v[18
0
:18
1
], [%[c8],
%[c9],
%[c10],
%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],
%[c9],
%[c10],
%[c11]], acc[1
18:119
], v[18
2
:18
3
], [%[c8],
%[c9],
%[c10],
%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],
%[c9],
%[c10],
%[c11]], acc[12
0
:12
1
], v[18
4
:18
5
], [%[c8],
%[c9],
%[c10],
%[c11]]
\n
"
" buffer_load_dwordx4 acc[244:247], %[v_os_b7], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[12
4
:12
5
], v[18
8
:18
9
], [%[c8],%[c9],%[c10],%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],%[c9],%[c10],%[c11]], acc[12
6
:12
7
], v[1
90:191
], [%[c8],%[c9],%[c10],%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c
12
],%[c
13
],%[c1
4
],%[c1
5
]], acc[1
12:113], v[240:24
1], [%[c
12
],%[c
13
],%[c1
4
],%[c1
5
]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[11
4
:11
5
], v[24
2
:24
3
], [%[c12],%[c13],%[c14],%[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],
%[c9],
%[c10],
%[c11]], acc[12
2
:12
3
], v[18
6
:18
7
], [%[c8],
%[c9],
%[c10],
%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c8],
%[c9],
%[c10],
%[c11]], acc[12
4
:12
5
], v[1
88:189
], [%[c8],
%[c9],
%[c10],
%[c11]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c
8
],
%[c
9
],
%[c1
0
],
%[c1
1
]], acc[1
26:127], v[190:19
1], [%[c
8
],
%[c
9
],
%[c1
0
],
%[c1
1
]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],
%[c13],
%[c14],
%[c15]], acc[11
2
:11
3
], v[24
0
:24
1
], [%[c12],
%[c13],
%[c14],
%[c15]]
\n
"
" buffer_load_dwordx4 acc[248:251], %[v_os_b7], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[11
6
:11
7
], v[24
4
:24
5
], [%[c12],%[c13],%[c14],%[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[11
8
:11
9
], v[24
6
:24
7
], [%[c12],%[c13],%[c14],%[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[1
20:121
], v[24
8
:24
9
], [%[c12],%[c13],%[c14],%[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[12
2
:12
3
], v[2
50:251
], [%[c12],%[c13],%[c14],%[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],
%[c13],
%[c14],
%[c15]], acc[11
4
:11
5
], v[24
2
:24
3
], [%[c12],
%[c13],
%[c14],
%[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],
%[c13],
%[c14],
%[c15]], acc[11
6
:11
7
], v[24
4
:24
5
], [%[c12],
%[c13],
%[c14],
%[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],
%[c13],
%[c14],
%[c15]], acc[1
18:119
], v[24
6
:24
7
], [%[c12],
%[c13],
%[c14],
%[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],
%[c13],
%[c14],
%[c15]], acc[12
0
:12
1
], v[2
48:249
], [%[c12],
%[c13],
%[c14],
%[c15]]
\n
"
" buffer_load_dwordx4 acc[252:255], %[v_os_b7], s[12:15], 0 offen offset:3072
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[12
4
:12
5
], v[25
2
:25
3
], [%[c12],%[c13],%[c14],%[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],%[c13],%[c14],%[c15]], acc[12
6
:12
7
], v[25
4
:25
5
], [%[c12],%[c13],%[c14],%[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],
%[c13],
%[c14],
%[c15]], acc[12
2
:12
3
], v[25
0
:25
1
], [%[c12],
%[c13],
%[c14],
%[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12],
%[c13],
%[c14],
%[c15]], acc[12
4
:12
5
], v[25
2
:25
3
], [%[c12],
%[c13],
%[c14],
%[c15]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c12], %[c13], %[c14], %[c15]], acc[126:127], v[254:255], [%[c12], %[c13], %[c14], %[c15]]
\n
"
// " s_add_u32 s60, 0x00000100, s80 \n"
// " s_cmp_lt_u32 s60, s81 \n"
// " s_cselect_b32 s56, s56, 0 \n"
...
...
@@ -561,167 +561,168 @@ struct FlatmmSnUK_GFX9_32x128x512_1x4x1_16x16x16_BF16
" s_waitcnt vmcnt(32)
\n
"
" s_barrier
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[128:129], v[128:129], 0
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[130:131], v[130:131], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[130:131], v[130:131], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[132:133], v[132:133], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[134:135], v[134:135], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[136:137], v[136:137], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[138:139], v[138:139], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[138:139], v[138:139], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[140:141], v[140:141], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[142:143], v[142:143], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[128:129], v[192:193], 0
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[130:131], v[194:195], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[130:131], v[194:195], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[132:133], v[196:197], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[134:135], v[198:199], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[136:137], v[200:201], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[138:139], v[202:203], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[138:139], v[202:203], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[140:141], v[204:205], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[142:143], v[206:207], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[144:145], v[128:129], 0
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[146:147], v[130:131], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[146:147], v[130:131], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[148:149], v[132:133], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[150:151], v[134:135], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[152:153], v[136:137], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[154:155], v[138:139], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[154:155], v[138:139], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[156:157], v[140:141], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[158:159], v[142:143], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[144:145], v[192:193], 0
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[146:147], v[194:195], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[146:147], v[194:195], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[148:149], v[196:197], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[150:151], v[198:199], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[152:153], v[200:201], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[154:155], v[202:203], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[154:155], v[202:203], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[156:157], v[204:205], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[158:159], v[206:207], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" s_waitcnt vmcnt(32)
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[160:161], v[144:145], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[162:163], v[146:147], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[162:163], v[146:147], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[164:165], v[148:149], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[166:167], v[150:151], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[168:169], v[152:153], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[170:171], v[154:155], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[170:171], v[154:155], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[172:173], v[156:157], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[174:175], v[158:159], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[160:161], v[208:209], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[162:163], v[210:211], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[162:163], v[210:211], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[164:165], v[212:213], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[166:167], v[214:215], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[168:169], v[216:217], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[170:171], v[218:219], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[170:171], v[218:219], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[172:173], v[220:221], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[174:175], v[222:223], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[176:177], v[144:145], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[178:179], v[146:147], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[178:179], v[146:147], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[180:181], v[148:149], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[182:183], v[150:151], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[184:185], v[152:153], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[186:187], v[154:155], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[186:187], v[154:155], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[188:189], v[156:157], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[190:191], v[158:159], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[176:177], v[208:209], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[178:179], v[210:211], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[178:179], v[210:211], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[180:181], v[212:213], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[182:183], v[214:215], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[184:185], v[216:217], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[186:187], v[218:219], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[186:187], v[218:219], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[188:189], v[220:221], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[190:191], v[222:223], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" s_waitcnt vmcnt(32)
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[192:193], v[160:161], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[194:195], v[162:163], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[12:15], 0 offen
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[194:195], v[162:163], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[196:197], v[164:165], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[198:199], v[166:167], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[200:201], v[168:169], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[202:203], v[170:171], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[202:203], v[170:171], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[204:205], v[172:173], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[206:207], v[174:175], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[192:193], v[224:225], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[194:195], v[226:227], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[194:195], v[226:227], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[196:197], v[228:229], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[198:199], v[230:231], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[200:201], v[232:233], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[202:203], v[234:235], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[12:15], 0 offen offset:3072
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[202:203], v[234:235], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[204:205], v[236:237], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[206:207], v[238:239], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[208:209], v[160:161], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[210:211], v[162:163], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[12:15], 0 offen
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[210:211], v[162:163], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[212:213], v[164:165], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[214:215], v[166:167], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[216:217], v[168:169], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[218:219], v[170:171], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[218:219], v[170:171], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[220:221], v[172:173], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[222:223], v[174:175], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[208:209], v[224:225], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[210:211], v[226:227], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[210:211], v[226:227], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[212:213], v[228:229], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[214:215], v[230:231], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[216:217], v[232:233], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[218:219], v[234:235], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[12:15], 0 offen offset:3072
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[218:219], v[234:235], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[220:221], v[236:237], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[222:223], v[238:239], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" s_waitcnt vmcnt(32)
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[224:225], v[176:177], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[226:227], v[178:179], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[12:15], 0 offen
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[226:227], v[178:179], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[228:229], v[180:181], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[230:231], v[182:183], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[232:233], v[184:185], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[234:235], v[186:187], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[234:235], v[186:187], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[236:237], v[188:189], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c16],%[c17],%[c18],%[c19]], acc[238:239], v[190:191], [%[c16],%[c17],%[c18],%[c19]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[224:225], v[240:241], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[226:227], v[242:243], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[226:227], v[242:243], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[228:229], v[244:245], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[230:231], v[246:247], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[232:233], v[248:249], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[234:235], v[250:251], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[12:15], 0 offen offset:3072
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[234:235], v[250:251], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[236:237], v[252:253], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c20],%[c21],%[c22],%[c23]], acc[238:239], v[254:255], [%[c20],%[c21],%[c22],%[c23]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[240:241], v[176:177], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[242:243], v[178:179], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[12:15], 0 offen
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[242:243], v[178:179], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[244:245], v[180:181], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[246:247], v[182:183], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[248:249], v[184:185], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[250:251], v[186:187], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[250:251], v[186:187], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[252:253], v[188:189], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c24],%[c25],%[c26],%[c27]], acc[254:255], v[190:191], [%[c24],%[c25],%[c26],%[c27]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[240:241], v[240:241], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[242:243], v[242:243], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[242:243], v[242:243], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[244:245], v[244:245], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[246:247], v[246:247], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[248:249], v[248:249], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[250:251], v[250:251], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[12:15], 0 offen offset:3072
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[250:251], v[250:251], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[252:253], v[252:253], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[254:255], v[254:255], [%[c28],%[c29],%[c30],%[c31]]
\n
"
" v_mfma_f32_16x16x16_bf16 [%[c28],%[c29],%[c30],%[c31]], acc[254:255], v[254:255], [%[c28],%[c29],%[c30],%[c31]]
\n
"
// " s_add_u32 s60, 0x00000100, s80 \n"
// " s_cmp_lt_u32 s60, s81 \n"
// " s_cselect_b32 s56, s56, 0 \n"
...
...
include/ck_tile/ops/flatmm/pipeline/uk/flatmm_uk_gfx9_32x512x128_1x4x1_16x16x16.hpp
View file @
572865a6
...
...
@@ -13,6 +13,26 @@ namespace ck_tile {
// require 4 wave, occupancy=1c
// agpr useage:256
// vgpr usage:64(A local) + 64(acc) + 8(os_a) + 8(os_b) = 144 (rem:112)
//
// for this gemm, 4 16x16x16 transposed layout
// input A vpgpr layout
// v0-v15: [ 0:15](gemm_m)x128(gemm_k)
// v16-v31: [16:31](gemm_m)x128(gemm_k)
// input B vpgpr layout
// v0-v15: [ 0: 15](gemm_n)x128(gemm_k)
// v16-v31: [ 64: 79](gemm_n)x128(gemm_k)
// ......................
// v111-v127: [448:463](gemm_n)x128(gemm_k)
// output C vpgpr layout
// v0-v3 : [ 0:15](gemm_m)x[ 0: 15](gemm_n)
// v4-v7 : [16:31](gemm_m)x[ 0: 15](gemm_n)
// v8-v11: [ 0:15](gemm_m)x[64: 79](gemm_n)
// v12-v15: [16:31](gemm_m)x[64: 79](gemm_n)
// ......................
// v56-v59: [ 0:15](gemm_m)x[448:463](gemm_n)
// v60-v63: [16:31](gemm_m)x[448:463](gemm_n)
struct
FlatmmUK_GFX9_32x512x128_1x4x1_16x16x16_BF16
{
static
constexpr
index_t
Block_M
=
32
;
...
...
@@ -42,7 +62,7 @@ struct FlatmmUK_GFX9_32x512x128_1x4x1_16x16x16_BF16
static
constexpr
index_t
Repeat_N
=
Block_N
/
(
Warp_N
*
WarpPerBlock_N
);
// 8
static
constexpr
index_t
Repeat_K
=
Block_K
/
(
Warp_K
*
WarpPerBlock_K
);
// 8/2=4
static
CK_TILE_DEVICE
constexpr
auto
MakeCBlock
Tile
()
static
CK_TILE_DEVICE
constexpr
auto
MakeCBlock
Dist
()
{
constexpr
auto
c_block_outer_dstr_encoding
=
tile_distribution_encoding
<
sequence
<>
,
...
...
@@ -53,11 +73,17 @@ struct FlatmmUK_GFX9_32x512x128_1x4x1_16x16x16_BF16
sequence
<
0
,
0
>>
{};
using
WG
=
WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution
;
using
CDataType
=
float
;
constexpr
auto
c_block_dstr_encode
=
detail
::
make_embed_tile_distribution_encoding
(
c_block_outer_dstr_encoding
,
typename
WG
::
CWarpDstrEncoding
{});
constexpr
auto
c_block_dstr
=
make_static_tile_distribution
(
c_block_dstr_encode
);
return
c_block_dstr
;
}
static
CK_TILE_DEVICE
constexpr
auto
MakeCBlockTile
()
{
using
CDataType
=
float
;
constexpr
auto
c_block_dstr
=
MakeCBlockDist
();
auto
c_block_tensor
=
make_static_distributed_tensor
<
CDataType
>
(
c_block_dstr
);
return
c_block_tensor
;
}
...
...
@@ -153,21 +179,8 @@ struct FlatmmUK_GFX9_32x512x128_1x4x1_16x16x16_BF16
// template <typename Problem>
CK_TILE_HOST_DEVICE
static
constexpr
auto
MakeLdsLoadDesc_A
()
{
// A async->LDS
// Note that, this descriptor is only to construct the layout inside LDS
// in real Gemm pipeline, ds_read may not follow this pattern
// (may follow that in tile_distribution)
// below code is almost the same as SmemStore dist, with difference:
// 1). modify the GuaranteedLastDimensionVectorLength of naive tensor desc
// 2). return discriptor is in NxK 2d layout
// constexpr index_t Block_M = Problem::BlockShape::Block_M0;
// constexpr index_t Block_K = Problem::BlockShape::Block_K0;
// constexpr index_t BlockSize = Problem::BlockShape::BlockSize;
constexpr
index_t
warpSize
=
ck_tile
::
get_warp_size
();
// constexpr index_t NumWarps = Problem::BlockShape::NumWarps;
// load from LDS to register, every wave has same layout
constexpr
index_t
KPack_
=
8
;
// GetSmemKPack_A<Problem>(); // LDS
constexpr
index_t
KVector
=
2
;
// GetAlignment_A<Problem>(); // async copy 1 dword
constexpr
index_t
KPad
=
KPack_
;
// pad between warps
constexpr
index_t
kAMLane
=
16
;
...
...
@@ -176,29 +189,12 @@ struct FlatmmUK_GFX9_32x512x128_1x4x1_16x16x16_BF16
constexpr
index_t
kKIter
=
2
;
static_assert
(
KPack_
==
(
kABKPerLane
*
kKIter
));
static_assert
(
Block_K
%
KVector
==
0
);
constexpr
index_t
LanesPerK
=
Block_K
/
KVector
;
// how many thread loading K
if
constexpr
(
LanesPerK
>=
warpSize
)
{
// need multiple waves to load K
static_assert
(
LanesPerK
%
warpSize
==
0
);
constexpr
index_t
wavesPerK
=
LanesPerK
/
warpSize
;
if
constexpr
(
wavesPerK
>=
NumWarps
)
{
// TODO: need multiple issues along K to load all data
}
else
{
// TODO: every wave load the same data!
static_assert
(
Block_K
%
(
kABKLane
*
KPack_
)
==
0
);
constexpr
index_t
issue_along_k
=
Block_K
/
(
kABKLane
*
KPack_
);
// 4
constexpr
index_t
issue_along_m
=
Block_M
/
(
kAMLane
);
// 2
constexpr
auto
lds_block_desc_0
=
make_naive_tensor_descriptor
(
make_tuple
(
number
<
issue_along_m
>
{},
// m0
number
<
kAMLane
>
{},
// m1
number
<
issue_along_k
>
{},
// k0
number
<
kABKLane
>
{},
// k1
number
<
KPack_
>
{}),
// k2
constexpr
auto
lds_block_desc_0
=
make_naive_tensor_descriptor
(
make_tuple
(
number
<
Repeat_M
>
{},
// m0 y
number
<
kAMLane
>
{},
// m1 p
number
<
Repeat_K
>
{},
// k0 y
number
<
kABKLane
>
{},
// k1 p
number
<
KPack_
>
{}),
// k2 y-vector
make_tuple
(
number
<
kAMLane
*
(
Block_K
+
KPad
)
>
{},
// m0
number
<
Block_K
+
KPad
>
{},
// m1
number
<
kABKLane
*
KPack_
>
{},
// k0
...
...
@@ -207,21 +203,16 @@ struct FlatmmUK_GFX9_32x512x128_1x4x1_16x16x16_BF16
number
<
KPack_
>
{},
// lds load vector
number
<
1
>
{});
constexpr
auto
lds_desc_m_k
=
transform_tensor_descriptor
(
lds_block_desc_0
,
make_tuple
(
make_merge_transform
(
make_tuple
(
number
<
issue_along_m
>
{},
number
<
kAMLane
>
{})),
make_merge_transform
(
make_tuple
(
number
<
issue_along_k
>
{},
number
<
kABKLane
>
{},
number
<
KPack_
>
{}))),
make_tuple
(
sequence
<
0
,
1
>
{},
sequence
<
2
,
3
,
4
>
{}),
make_tuple
(
sequence
<
0
>
{},
sequence
<
1
>
{}));
return
lds_desc_m_k
;
}
}
else
{
}
constexpr
auto
lds_desc_m_k
=
transform_tensor_descriptor
(
lds_block_desc_0
,
make_tuple
(
make_merge_transform
(
make_tuple
(
number
<
Repeat_M
>
{},
number
<
kAMLane
>
{})),
make_merge_transform
(
make_tuple
(
number
<
Repeat_K
>
{},
number
<
kABKLane
>
{},
number
<
KPack_
>
{}))),
make_tuple
(
sequence
<
0
,
1
>
{},
sequence
<
2
,
3
,
4
>
{}),
make_tuple
(
sequence
<
0
>
{},
sequence
<
1
>
{}));
return
lds_desc_m_k
;
}
static
constexpr
auto
GetGemm_AWarpEnc
()
...
...
@@ -271,10 +262,10 @@ struct FlatmmUK_GFX9_32x512x128_1x4x1_16x16x16_BF16
auto
a_sld
=
[
&
]()
{
constexpr
auto
a_warp_enc_
=
GetGemm_AWarpEnc
();
constexpr
auto
a_outer_dstr_enc
=
tile_distribution_encoding
<
sequence
<>
,
sequence
<
WarpPerBlock_N
>
,
tuple
<
sequence
<
Repeat_M
,
WarpPerBlock_M
>
,
sequence
<
Repeat_K
>>
,
tuple
<
sequence
<
1
>>
,
tuple
<
sequence
<
1
>>
,
tuple
<
sequence
<
1
,
0
>>
,
tuple
<
sequence
<
1
,
0
>>
,
sequence
<
1
,
2
>
,
sequence
<
0
,
0
>>
{};
constexpr
auto
a_block_dstr_encode
=
...
...
@@ -300,6 +291,12 @@ struct FlatmmUK_GFX9_32x512x128_1x4x1_16x16x16_BF16
},
number
<
a_sld
.
get_num_of_access
()
>
{});
printf
(
"----- tid:%d, a_sld:%d
\n
"
,
static_cast
<
index_t
>
(
threadIdx
.
x
),
static_cast
<
index_t
>
(
a_sld
.
cached_coords_
[
number
<
0
>
{}].
get_offset
()));
index_t
loop_cnt
=
k
/
Block_K
;
// this is the acc thread buffer
...
...
include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp
View file @
572865a6
...
...
@@ -586,6 +586,47 @@ struct FusedMoeGemmPipelineFlatmmPolicy
return
desc
;
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
MakeBridgeLdsStoreForUKDesc
()
{
constexpr
index_t
WarpPerBlock_N
=
Problem
::
BlockShape
::
WarpPerBlock_N0
;
constexpr
index_t
Repeat_N
=
Problem
::
BlockShape
::
Repeat_N0
;
constexpr
index_t
Repeat_M
=
Problem
::
BlockShape
::
Repeat_M0
;
constexpr
index_t
kAMLane
=
16
;
constexpr
index_t
kABKLane
=
4
;
constexpr
index_t
kABKPerLane
=
4
;
constexpr
index_t
KPack
=
kABKPerLane
;
constexpr
auto
lds_block_desc_0
=
make_naive_tensor_descriptor
(
make_tuple
(
number
<
Repeat_M
>
{},
// m
number
<
Repeat_N
>
{},
// n
number
<
WarpPerBlock_N
>
{},
// n
number
<
kABKLane
>
{},
// n
number
<
kAMLane
>
{},
// m
number
<
KPack
>
{}),
// n
make_tuple
(
number
<
Repeat_N
*
WarpPerBlock_N
*
kABKLane
*
kAMLane
*
KPack
>
{},
// m
number
<
WarpPerBlock_N
*
kABKLane
*
kAMLane
*
KPack
>
{},
// n
number
<
kABKLane
*
kAMLane
*
KPack
>
{},
// n
number
<
kAMLane
*
KPack
>
{},
// n
number
<
KPack
>
{},
// m
number
<
1
>
{}),
// n
number
<
KPack
>
{},
// lds store vector(actually no explicit store)
number
<
1
>
{});
constexpr
auto
desc
=
transform_tensor_descriptor
(
lds_block_desc_0
,
make_tuple
(
make_merge_transform
(
make_tuple
(
number
<
Repeat_M
>
{},
number
<
kAMLane
>
{})),
make_merge_transform
(
make_tuple
(
number
<
Repeat_N
>
{},
number
<
WarpPerBlock_N
>
{},
number
<
kABKLane
>
{},
number
<
KPack
>
{}))
),
make_tuple
(
sequence
<
0
,
4
>
{},
sequence
<
1
,
2
,
3
,
5
>
{}),
make_tuple
(
sequence
<
0
>
{},
sequence
<
1
>
{}));
return
desc
;
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
GetWarpGemm0
()
{
...
...
include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp
View file @
572865a6
...
...
@@ -340,12 +340,15 @@ struct FusedMoeGemmPipeline_FlatmmUk
number
<
row_ids_a
.
size
()
>
{});
auto
bridge_sst_win
=
[
&
]()
{
return
make_tile_window
(
constexpr
auto
desc_
=
Policy
::
template
MakeBridgeLdsStoreForUKDesc
<
Problem
>();
constexpr
auto
dist_
=
Policy
::
template
GetUK_0
<
Problem
>().
MakeCBlockDist
();
return
make_tile_window_linear
(
make_tensor_view
<
address_space_enum
::
lds
>
(
reinterpret_cast
<
YDataType
*>
(
smem
),
Policy
::
template
MakeBridgeLdsStoreDesc
<
Problem
>()),
Policy
::
template
MakeBridgeLdsStoreDesc
<
Problem
>().
get_lengths
(),
{
0
,
0
});
desc_
),
desc_
.
get_lengths
(),
{
0
,
0
},
dist_
);
}();
auto
o_res
=
make_wave_buffer_resource
(
reinterpret_cast
<
const
ODataType
*>
(
kargs
.
o_ptr
),
...
...
@@ -439,8 +442,56 @@ struct FusedMoeGemmPipeline_FlatmmUk
BlockShape
::
Block_W0
);
// tile offset for B matrix each unroll
// return ;
//sweep_tile(acc_0,
// [&](auto idx) { typename Problem::GateActivation{}(acc_0(idx), acc_0[idx]); });
sweep_tile
(
acc_0
,
[
&
](
auto
idx
)
{
typename
Problem
::
GateActivation
{}(
acc_0
(
idx
),
acc_0
[
idx
]);
});
[
&
](
auto
idx0
,
auto
idx1
)
{
fp32x2_t
v_
{
acc_0
(
idx0
),
acc_0
(
idx1
)};
typename
Problem
::
GateActivation
{}(
v_
,
v_
);
acc_0
(
idx0
)
=
v_
.
x
;
acc_0
(
idx1
)
=
v_
.
y
;
},
sequence
<
1
,
2
>
{});
#if 0
printf("bid:%d,%d, tid:%d, sorted_tile_id:%d(, intermediate_tile_id:%d, e:%d, "
"interm_idx_nr:%d, coords:a:%d,%d,%d, row_ids_a:%d,%d,%d, (%d)g_coords:%d.%d.%d, bridge_sst_win:%d"
"acc:%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f\n",
static_cast<int>(blockIdx.x),
static_cast<int>(blockIdx.y),
static_cast<int>(threadIdx.x),
sorted_tile_id,
intermediate_tile_id,
expert_id,
interm_idx_nr,
row_coords_a[0],
row_coords_a[1],
row_coords_a[7],
row_ids_a[0],
row_ids_a[1],
row_ids_a[7],
kr_0 * BlockShape::Block_W0,
g_coords[number<0>{}],
g_coords[number<1>{}],
g_coords[number<7>{}],
bridge_sst_win.cached_coords_[number<0>{}].get_offset(),
acc_0.get_thread_buffer()[number<0>{}],
acc_0.get_thread_buffer()[number<1>{}],
acc_0.get_thread_buffer()[number<2>{}],
acc_0.get_thread_buffer()[number<3>{}],
acc_0.get_thread_buffer()[number<4>{}],
acc_0.get_thread_buffer()[number<5>{}],
acc_0.get_thread_buffer()[number<6>{}],
acc_0.get_thread_buffer()[number<7>{}],
acc_0.get_thread_buffer()[number<8 + 0>{}],
acc_0.get_thread_buffer()[number<8 + 1>{}],
acc_0.get_thread_buffer()[number<8 + 2>{}],
acc_0.get_thread_buffer()[number<8 + 3>{}],
acc_0.get_thread_buffer()[number<8 + 4>{}],
acc_0.get_thread_buffer()[number<8 + 5>{}],
acc_0.get_thread_buffer()[number<8 + 6>{}],
acc_0.get_thread_buffer()[number<8 + 7>{}]);
#endif
auto
y_pre
=
cast_tile
<
YDataType
>
(
acc_0
);
store_tile
(
bridge_sst_win
,
y_pre
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment