Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
6f7d1272
Commit
6f7d1272
authored
Jan 05, 2025
by
shengnxu
Browse files
changed all the scale outside except for uq
parent
9a46c0e7
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
671 additions
and
834 deletions
+671
-834
include/ck_tile/ops/flatmm/block/flatmm_32x512x256_1x4x1_16x16x64_int8.hpp
...ps/flatmm/block/flatmm_32x512x256_1x4x1_16x16x64_int8.hpp
+7
-17
include/ck_tile/ops/flatmm/block/flatmm_sn_32x256x512_1x4x1_16x16x64_int8.hpp
...flatmm/block/flatmm_sn_32x256x512_1x4x1_16x16x64_int8.hpp
+20
-15
include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x256x512_1x4x1_16x16x32_int8_1.inc
...uk/flatmm_sn_uk_gfx9_32x256x512_1x4x1_16x16x32_int8_1.inc
+102
-66
include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x256x512_1x4x1_16x16x32_int8_2.inc
...uk/flatmm_sn_uk_gfx9_32x256x512_1x4x1_16x16x32_int8_2.inc
+436
-466
include/ck_tile/ops/flatmm/block/uk/flatmm_uk_gfx9_32x512x256_1x1x1_16x16x32_int8.inc
...lock/uk/flatmm_uk_gfx9_32x512x256_1x1x1_16x16x32_int8.inc
+45
-241
include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk_int8.hpp
...ed_moe/pipeline/fused_moegemm_pipeline_flatmm_uk_int8.hpp
+61
-29
No files found.
include/ck_tile/ops/flatmm/block/flatmm_32x512x256_1x4x1_16x16x64_int8.hpp
View file @
6f7d1272
...
...
@@ -245,13 +245,10 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16
// TODO: need paired with tile_window_linear!
// TODO: need call init_raw() before call this function!
template
<
typename
A
Token_id
,
typename
AQRes
,
typename
DQRes
,
typename
GQRes
,
typename
SMQRes
,
typename
ARes
,
typename
ACoords
,
typename
BRes
,
typename
BCoords
>
template
<
typename
A
scale
,
typename
GQscale
,
typename
ARes
,
typename
ACoords
,
typename
BRes
,
typename
BCoords
>
CK_TILE_DEVICE
auto
operator
()(
const
AToken_id
&
row_ids_a_
,
const
AQRes
&
res_aq
,
const
DQRes
&
res_dq
,
const
GQRes
&
res_gq
,
const
SMQRes
&
res_smq
,
operator
()(
const
Ascale
&
a_scale_
,
const
GQscale
&
gq_scale_
,
const
ARes
&
res_a
,
const
ACoords
&
cached_coords_a
,
const
BRes
&
res_b
,
...
...
@@ -263,7 +260,6 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16
{
static_assert
(
ACoords
::
size
()
==
Block_M
*
Block_K
/
BlockSize
/
4
/*2x per dword*/
);
// 8
static_assert
(
BCoords
::
size
()
==
Repeat_N
);
static_assert
(
AToken_id
::
size
()
==
Repeat_M
);
static_assert
(
Ascale
::
size
()
==
Repeat_M
);
auto
a_sst
=
make_tile_window
(
...
...
@@ -372,10 +368,6 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16
register
int
v_z61
asm
(
"v189"
)
=
0
;
register
int
v_z62
asm
(
"v190"
)
=
0
;
register
int
v_z63
asm
(
"v191"
)
=
0
;
index_t
temp0
=
static_cast
<
index_t
>
(
row_ids_a_
[
number
<
0
>
{}]);
index_t
temp1
=
static_cast
<
index_t
>
(
row_ids_a_
[
number
<
1
>
{}]);
// B nr->kr
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Winline-asm"
...
...
@@ -449,13 +441,11 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16
[
c61
]
"+v"
(
v_z61
),
[
c62
]
"+v"
(
v_z62
),
[
c63
]
"+v"
(
v_z63
),
[
v_token_id0
]
"+v"
(
temp0
),
[
v_token_id1
]
"+v"
(
temp1
),
[
s_mem_
]
"+r"
(
smem
)
:
[
s_res_aq
]
"
s
"
(
res_aq
),
[
s_res_dq
]
"
s
"
(
res_dq
),
[
s_res_gq
]
"
s
"
(
res_gq
),
[
s_res_smq
]
"
s
"
(
res_smq
),
:
[
a_scale0
]
"
v
"
(
a_scale_
[
0
]
),
[
a_scale1
]
"
v
"
(
a_scale_
[
1
]
),
[
gq_scale0
]
"
v
"
(
gq_scale_
[
0
]
),
[
gq_scale1
]
"
v
"
(
gq_scale_
[
1
]
),
[
s_res_a
]
"s"
(
res_a
),
// [s_res_a1]"s"(res_a[1]),
// [s_res_a2]"s"(res_a[2]),
...
...
include/ck_tile/ops/flatmm/block/flatmm_sn_32x256x512_1x4x1_16x16x64_int8.hpp
View file @
6f7d1272
...
...
@@ -80,21 +80,25 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x
// template <typename AWindow, typename BWindow, typename OWindow, typename ScaleTensor>
template
<
typename
DQRes
,
typename
BRes
,
typename
DQCoords
,
typename
BCoords
,
typename
ORes
,
typename
OCoords
,
typename
OFlags
>
// typename ScaleTensor>
typename
OFlags
,
typename
ScaleTensor
,
typename
YScaleTensor
>
CK_TILE_DEVICE
auto
operator
()(
const
DQRes
&
res_dq
,
const
BRes
&
res_b
,
const
DQCoords
&
cached_coords_dq
,
const
BCoords
&
cached_coords_b
,
const
ORes
&
res_o
,
const
OCoords
&
cached_coords_o
,
const
OFlags
&
o_flags
,
// this should be in sgpr
CK_TILE_LDS_ADDR
void
*
smem
,
index_t
n
,
// loop along n dim
// const ScaleTensor& scale_,
const
ScaleTensor
&
scale_
,
const
YScaleTensor
&
smq_scale_
,
index_t
tile_offset_dq
,
index_t
tile_offset_b
,
// stride b is fixed to blockKr * blockW, but still can adjust
index_t
tile_offset_half_b
,
//splited load alone K in to 2 part
...
...
@@ -108,9 +112,9 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x
const
index_t
tile_stride_o_bytes
=
tile_offset_o
*
sizeof
(
ODataType
);
const
index_t
tile_stride_dq_bytes
=
tile_offset_dq
*
sizeof
(
DScaleDataType
);
//
static_assert(ScaleTensor::size() == 2);
//
float s0 = scale_[number<0>{}];
//
float s1 = scale_[number<1>{}];
static_assert
(
ScaleTensor
::
size
()
==
2
);
float
s0
=
scale_
[
number
<
0
>
{}];
float
s1
=
scale_
[
number
<
1
>
{}];
index_t
loop_cnt
=
n
;
...
...
@@ -220,8 +224,10 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x
// [v_sld_y_os]"v"(sld_y_os),
// [v_sfl_sld]"v"(sfl_sld),
// [v_sfl_sst]"v"(sfl_sst),
[
smq_scale0
]
"s"
(
smq_scale_
[
0
]),
[
smq_scale1
]
"s"
(
smq_scale_
[
1
]),
[
s_res_dq
]
"s"
(
res_dq
),
[
s_res_o0
]
"s"
(
res_o
[
0
]),
[
s_res_o0
]
"s"
(
res_o
[
0
]),
[
s_res_o1
]
"s"
(
res_o
[
1
]),
//[s_res_o2]"s"(res_o[2]),
//[s_res_o3]"s"(res_o[3]),
...
...
@@ -229,6 +235,7 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x
// [s_res_b1]"s"(res_b[1]),
// [s_res_b2]"s"(res_b[2]),
// [s_res_b3]"s"(res_b[3]),
[
v_os_dq
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_dq
*
sizeof
(
DScaleDataType
))),
[
v_os_o0
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_o
[
number
<
0
>
{}]
*
sizeof
(
ODataType
))),
[
v_os_o1
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_o
[
number
<
1
>
{}]
*
sizeof
(
ODataType
))),
[
v_os_o2
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_o
[
number
<
2
>
{}]
*
sizeof
(
ODataType
))),
...
...
@@ -293,8 +300,7 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x
"s55"
,
"s56"
,
"s57"
,
"s58"
,
"s59"
,
"s60"
,
"s61"
,
"s62"
,
"s63"
,
"s64"
,
"s65"
,
"s66"
,
"s67"
,
"s68"
,
"s69"
,
"s70"
,
"s71"
,
"s72"
,
"s73"
,
"s74"
,
"s75"
,
"s76"
,
"s77"
,
"s78"
,
"s79"
,
"s80"
,
// s86 as tmp
"v1"
,
"v2"
,
"v3"
,
"v4"
,
"v5"
,
"v6"
,
"v7"
,
"v8"
,
"v9"
,
"v10"
,
"v11"
,
"v12"
,
"v13"
,
"v14"
,
"v15"
,
"v16"
,
"v17"
,
"v18"
,
"v19"
,
"v1"
,
"v2"
,
"v3"
,
"v4"
,
"v12"
,
"v13"
,
"v14"
,
"v15"
,
"v16"
,
"v17"
,
"v18"
,
"v19"
,
"v20"
,
"v21"
,
"v22"
,
"v23"
,
"v24"
,
"v25"
,
"v50"
,
"v51"
,
"v52"
,
"v53"
,
"v54"
,
"v55"
,
"v56"
,
"v57"
,
"v64"
,
"v65"
,
"v66"
,
"v67"
,
"v68"
,
"v69"
,
"v70"
,
"v71"
,
"v72"
,
"v73"
,
...
...
@@ -366,8 +372,8 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x
// [v_sfl_sld]"v"(sfl_sld),
// [v_sfl_sst]"v"(sfl_sst),
[
s_res_dq
]
"s"
(
res_dq
),
[
s_res_o0
]
"s"
(
res_o
[
0
]),
[
s_res_o1
]
"s"
(
res_o
[
1
]),
[
s_res_o0
]
"s"
(
res_o
[
0
]),
[
s_res_o1
]
"s"
(
res_o
[
1
]),
//[s_res_o2]"s"(res_o[2]),
//[s_res_o3]"s"(res_o[3]),
[
s_res_d
]
"s"
(
res_b
),
...
...
@@ -390,8 +396,8 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x
[
s_tile_os_b_half
]
"s"
(
tile_offset_half_b_bytes
),
[
s_tile_os_b
]
"s"
(
tile_stride_b_bytes
),
[
s_tile_os_dq
]
"s"
(
tile_stride_dq_bytes
),
//
[scale_0]"v"(s0),
//
[scale_1]"v"(s1),
[
scale_0
]
"v"
(
s0
),
[
scale_1
]
"v"
(
s1
),
// [v_nan_lo]"v"(nan_lo),
// [v_nan_hi]"v"(nan_hi),
[
s_execflag_0
]
"s"
(
o_flags
[
number
<
0
>
{}]),
...
...
@@ -438,8 +444,7 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x
"s55"
,
"s56"
,
"s57"
,
"s58"
,
"s59"
,
"s60"
,
"s61"
,
"s62"
,
"s63"
,
"s64"
,
"s65"
,
"s66"
,
"s67"
,
"s68"
,
"s69"
,
"s70"
,
"s71"
,
"s72"
,
"s73"
,
"s74"
,
"s75"
,
"s76"
,
"s77"
,
"s78"
,
"s79"
,
"s80"
,
// s86 as tmp
"v1"
,
"v2"
,
"v3"
,
"v4"
,
"v5"
,
"v6"
,
"v7"
,
"v8"
,
"v9"
,
"v10"
,
"v11"
,
"v12"
,
"v13"
,
"v14"
,
"v15"
,
"v16"
,
"v17"
,
"v18"
,
"v19"
,
"v1"
,
"v2"
,
"v3"
,
"v4"
,
"v12"
,
"v13"
,
"v14"
,
"v15"
,
"v16"
,
"v17"
,
"v18"
,
"v19"
,
"v20"
,
"v21"
,
"v22"
,
"v23"
,
"v24"
,
"v25"
,
"v50"
,
"v51"
,
"v52"
,
"v53"
,
"v54"
,
"v55"
,
"v56"
,
"v57"
,
"v64"
,
"v65"
,
"v66"
,
"v67"
,
"v68"
,
"v69"
,
"v70"
,
"v71"
,
"v72"
,
"v73"
,
...
...
include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x256x512_1x4x1_16x16x32_int8_1.inc
View file @
6f7d1272
...
...
@@ -27,8 +27,12 @@
# define _UK_ATOMIC_ADD_ "global_atomic_pk_add_f16"
#endif
" v_and_b32 v0, 0x3f, v0
\n
"
" v_lshrrev_b32 v3, 6, v0
\n
"
" v_readfirstlane_b32 s7, v3
\n
"
" s_waitcnt vmcnt(24)
\n
"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], %[s_res_d], 0 offen
\n
"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], %[s_res_d], 0 offen offset:1024
\n
"
" v_mul_f32 v54, v128, v128
\n
"
" v_mul_f32 v55, v129, v129
\n
"
" v_mul_f32 v56, v130, v130
\n
"
...
...
@@ -49,7 +53,6 @@
" v_exp_f32 v55, v55
\n
"
" v_exp_f32 v56, v56
\n
"
" v_exp_f32 v57, v57
\n
"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], %[s_res_d], 0 offen offset:1024
\n
"
" v_add_f32 v54, v54, 1.0
\n
"
" v_add_f32 v55, v55, 1.0
\n
"
" v_add_f32 v56, v56, 1.0
\n
"
...
...
@@ -577,71 +580,71 @@
" v_mul_f32 v189, v189, v55
\n
"
" v_mul_f32 v190, v190, v56
\n
"
" v_mul_f32 v191, v191, v57
\n
"
" v_mul_f32 v128,
v18
, v128 row_newbcast:0
\n
"
" v_mul_f32 v129,
v18
, v129 row_newbcast:1
\n
"
" v_mul_f32 v130,
v18
, v130 row_newbcast:2
\n
"
" v_mul_f32 v131,
v18
, v131 row_newbcast:3
\n
"
" v_mul_f32 v132,
v18
, v132 row_newbcast:0
\n
"
" v_mul_f32 v133,
v18
, v133 row_newbcast:1
\n
"
" v_mul_f32 v134,
v18
, v134 row_newbcast:2
\n
"
" v_mul_f32 v135,
v18
, v135 row_newbcast:3
\n
"
" v_mul_f32 v136,
v18
, v136 row_newbcast:4
\n
"
" v_mul_f32 v137,
v18
, v137 row_newbcast:5
\n
"
" v_mul_f32 v138,
v18
, v138 row_newbcast:6
\n
"
" v_mul_f32 v139,
v18
, v139 row_newbcast:7
\n
"
" v_mul_f32 v140,
v18
, v140 row_newbcast:4
\n
"
" v_mul_f32 v141,
v18
, v141 row_newbcast:5
\n
"
" v_mul_f32 v142,
v18
, v142 row_newbcast:6
\n
"
" v_mul_f32 v143,
v18
, v143 row_newbcast:7
\n
"
" v_mul_f32 v144,
v18
, v144 row_newbcast:8
\n
"
" v_mul_f32 v145,
v18
, v145 row_newbcast:9
\n
"
" v_mul_f32 v146,
v18
, v146 row_newbcast:10
\n
"
" v_mul_f32 v147,
v18
, v147 row_newbcast:11
\n
"
" v_mul_f32 v148,
v18
, v148 row_newbcast:8
\n
"
" v_mul_f32 v149,
v18
, v149 row_newbcast:9
\n
"
" v_mul_f32 v150,
v18
, v150 row_newbcast:10
\n
"
" v_mul_f32 v151,
v18
, v151 row_newbcast:11
\n
"
" v_mul_f32 v152,
v18
, v152 row_newbcast:12
\n
"
" v_mul_f32 v153,
v18
, v153 row_newbcast:13
\n
"
" v_mul_f32 v154,
v18
, v154 row_newbcast:14
\n
"
" v_mul_f32 v155,
v18
, v155 row_newbcast:15
\n
"
" v_mul_f32 v156,
v18
, v156 row_newbcast:12
\n
"
" v_mul_f32 v157,
v18
, v157 row_newbcast:13
\n
"
" v_mul_f32 v158,
v18
, v158 row_newbcast:14
\n
"
" v_mul_f32 v159,
v18
, v159 row_newbcast:15
\n
"
" v_mul_f32 v160,
v19
, v160 row_newbcast:0
\n
"
" v_mul_f32 v161,
v19
, v161 row_newbcast:1
\n
"
" v_mul_f32 v162,
v19
, v162 row_newbcast:2
\n
"
" v_mul_f32 v163,
v19
, v163 row_newbcast:3
\n
"
" v_mul_f32 v164,
v19
, v164 row_newbcast:0
\n
"
" v_mul_f32 v165,
v19
, v165 row_newbcast:1
\n
"
" v_mul_f32 v166,
v19
, v166 row_newbcast:2
\n
"
" v_mul_f32 v167,
v19
, v167 row_newbcast:3
\n
"
" v_mul_f32 v168,
v19
, v168 row_newbcast:4
\n
"
" v_mul_f32 v169,
v19
, v169 row_newbcast:5
\n
"
" v_mul_f32 v170,
v19
, v170 row_newbcast:6
\n
"
" v_mul_f32 v171,
v19
, v171 row_newbcast:7
\n
"
" v_mul_f32 v172,
v19
, v172 row_newbcast:4
\n
"
" v_mul_f32 v173,
v19
, v173 row_newbcast:5
\n
"
" v_mul_f32 v174,
v19
, v174 row_newbcast:6
\n
"
" v_mul_f32 v175,
v19
, v175 row_newbcast:7
\n
"
" v_mul_f32 v176,
v19
, v176 row_newbcast:8
\n
"
" v_mul_f32 v177,
v19
, v177 row_newbcast:9
\n
"
" v_mul_f32 v178,
v19
, v178 row_newbcast:10
\n
"
" v_mul_f32 v179,
v19
, v179 row_newbcast:11
\n
"
" v_mul_f32 v180,
v19
, v180 row_newbcast:8
\n
"
" v_mul_f32 v181,
v19
, v181 row_newbcast:9
\n
"
" v_mul_f32 v182,
v19
, v182 row_newbcast:10
\n
"
" v_mul_f32 v183,
v19
, v183 row_newbcast:11
\n
"
" v_mul_f32 v184,
v19
, v184 row_newbcast:12
\n
"
" v_mul_f32 v185,
v19
, v185 row_newbcast:13
\n
"
" v_mul_f32 v186,
v19
, v186 row_newbcast:14
\n
"
" v_mul_f32 v187,
v19
, v187 row_newbcast:15
\n
"
" v_mul_f32 v188,
v19
, v188 row_newbcast:12
\n
"
" v_mul_f32 v189,
v19
, v189 row_newbcast:13
\n
"
" v_mul_f32 v190,
v19
, v190 row_newbcast:14
\n
"
" v_mul_f32 v191,
v19
, v191 row_newbcast:15
\n
"
" buffer_load_dword v12,
v5
, %[s_res_dq], 0 offen
\n
"
" v_mul_f32 v128,
%[smq_scale0]
, v128 row_newbcast:0
\n
"
" v_mul_f32 v129,
%[smq_scale0]
, v129 row_newbcast:1
\n
"
" v_mul_f32 v130,
%[smq_scale0]
, v130 row_newbcast:2
\n
"
" v_mul_f32 v131,
%[smq_scale0]
, v131 row_newbcast:3
\n
"
" v_mul_f32 v132,
%[smq_scale0]
, v132 row_newbcast:0
\n
"
" v_mul_f32 v133,
%[smq_scale0]
, v133 row_newbcast:1
\n
"
" v_mul_f32 v134,
%[smq_scale0]
, v134 row_newbcast:2
\n
"
" v_mul_f32 v135,
%[smq_scale0]
, v135 row_newbcast:3
\n
"
" v_mul_f32 v136,
%[smq_scale0]
, v136 row_newbcast:4
\n
"
" v_mul_f32 v137,
%[smq_scale0]
, v137 row_newbcast:5
\n
"
" v_mul_f32 v138,
%[smq_scale0]
, v138 row_newbcast:6
\n
"
" v_mul_f32 v139,
%[smq_scale0]
, v139 row_newbcast:7
\n
"
" v_mul_f32 v140,
%[smq_scale0]
, v140 row_newbcast:4
\n
"
" v_mul_f32 v141,
%[smq_scale0]
, v141 row_newbcast:5
\n
"
" v_mul_f32 v142,
%[smq_scale0]
, v142 row_newbcast:6
\n
"
" v_mul_f32 v143,
%[smq_scale0]
, v143 row_newbcast:7
\n
"
" v_mul_f32 v144,
%[smq_scale0]
, v144 row_newbcast:8
\n
"
" v_mul_f32 v145,
%[smq_scale0]
, v145 row_newbcast:9
\n
"
" v_mul_f32 v146,
%[smq_scale0]
, v146 row_newbcast:10
\n
"
" v_mul_f32 v147,
%[smq_scale0]
, v147 row_newbcast:11
\n
"
" v_mul_f32 v148,
%[smq_scale0]
, v148 row_newbcast:8
\n
"
" v_mul_f32 v149,
%[smq_scale0]
, v149 row_newbcast:9
\n
"
" v_mul_f32 v150,
%[smq_scale0]
, v150 row_newbcast:10
\n
"
" v_mul_f32 v151,
%[smq_scale0]
, v151 row_newbcast:11
\n
"
" v_mul_f32 v152,
%[smq_scale0]
, v152 row_newbcast:12
\n
"
" v_mul_f32 v153,
%[smq_scale0]
, v153 row_newbcast:13
\n
"
" v_mul_f32 v154,
%[smq_scale0]
, v154 row_newbcast:14
\n
"
" v_mul_f32 v155,
%[smq_scale0]
, v155 row_newbcast:15
\n
"
" v_mul_f32 v156,
%[smq_scale0]
, v156 row_newbcast:12
\n
"
" v_mul_f32 v157,
%[smq_scale0]
, v157 row_newbcast:13
\n
"
" v_mul_f32 v158,
%[smq_scale0]
, v158 row_newbcast:14
\n
"
" v_mul_f32 v159,
%[smq_scale0]
, v159 row_newbcast:15
\n
"
" v_mul_f32 v160,
%[smq_scale1]
, v160 row_newbcast:0
\n
"
" v_mul_f32 v161,
%[smq_scale1]
, v161 row_newbcast:1
\n
"
" v_mul_f32 v162,
%[smq_scale1]
, v162 row_newbcast:2
\n
"
" v_mul_f32 v163,
%[smq_scale1]
, v163 row_newbcast:3
\n
"
" v_mul_f32 v164,
%[smq_scale1]
, v164 row_newbcast:0
\n
"
" v_mul_f32 v165,
%[smq_scale1]
, v165 row_newbcast:1
\n
"
" v_mul_f32 v166,
%[smq_scale1]
, v166 row_newbcast:2
\n
"
" v_mul_f32 v167,
%[smq_scale1]
, v167 row_newbcast:3
\n
"
" v_mul_f32 v168,
%[smq_scale1]
, v168 row_newbcast:4
\n
"
" v_mul_f32 v169,
%[smq_scale1]
, v169 row_newbcast:5
\n
"
" v_mul_f32 v170,
%[smq_scale1]
, v170 row_newbcast:6
\n
"
" v_mul_f32 v171,
%[smq_scale1]
, v171 row_newbcast:7
\n
"
" v_mul_f32 v172,
%[smq_scale1]
, v172 row_newbcast:4
\n
"
" v_mul_f32 v173,
%[smq_scale1]
, v173 row_newbcast:5
\n
"
" v_mul_f32 v174,
%[smq_scale1]
, v174 row_newbcast:6
\n
"
" v_mul_f32 v175,
%[smq_scale1]
, v175 row_newbcast:7
\n
"
" v_mul_f32 v176,
%[smq_scale1]
, v176 row_newbcast:8
\n
"
" v_mul_f32 v177,
%[smq_scale1]
, v177 row_newbcast:9
\n
"
" v_mul_f32 v178,
%[smq_scale1]
, v178 row_newbcast:10
\n
"
" v_mul_f32 v179,
%[smq_scale1]
, v179 row_newbcast:11
\n
"
" v_mul_f32 v180,
%[smq_scale1]
, v180 row_newbcast:8
\n
"
" v_mul_f32 v181,
%[smq_scale1]
, v181 row_newbcast:9
\n
"
" v_mul_f32 v182,
%[smq_scale1]
, v182 row_newbcast:10
\n
"
" v_mul_f32 v183,
%[smq_scale1]
, v183 row_newbcast:11
\n
"
" v_mul_f32 v184,
%[smq_scale1]
, v184 row_newbcast:12
\n
"
" v_mul_f32 v185,
%[smq_scale1]
, v185 row_newbcast:13
\n
"
" v_mul_f32 v186,
%[smq_scale1]
, v186 row_newbcast:14
\n
"
" v_mul_f32 v187,
%[smq_scale1]
, v187 row_newbcast:15
\n
"
" v_mul_f32 v188,
%[smq_scale1]
, v188 row_newbcast:12
\n
"
" v_mul_f32 v189,
%[smq_scale1]
, v189 row_newbcast:13
\n
"
" v_mul_f32 v190,
%[smq_scale1]
, v190 row_newbcast:14
\n
"
" v_mul_f32 v191,
%[smq_scale1]
, v191 row_newbcast:15
\n
"
" buffer_load_dword v12,
%[v_os_dq]
, %[s_res_dq], 0 offen
\n
"
" v_mov_b32 v22, 0x358637bd
\n
"
" v_mov_b32 v23, 0x358637bd
\n
"
" v_max3_f32 v22, abs(v128), abs(v129), v22
\n
"
...
...
@@ -934,9 +937,42 @@
" v_lshlrev_b32 v54, 1, v54
\n
"
" v_add_u32 v55, v54, v55
\n
"
" v_lshlrev_b32 v54, 2, v55
\n
"
" ds_read_b64 v[128:129], v54 offset:18688
\n
"
" ds_read_b64 v[130:131], v54 offset:18816
\n
"
" ds_read_b64 v[132:133], v54 offset:19712
\n
"
" ds_read_b64 v[134:135], v54 offset:19840
\n
"
" ds_read_b64 v[136:137], v54 offset:20736
\n
"
" ds_read_b64 v[138:139], v54 offset:20864
\n
"
" ds_read_b64 v[140:141], v54 offset:21760
\n
"
" ds_read_b64 v[142:143], v54 offset:21888
\n
"
" ds_read_b64 v[144:145], v54 offset:22784
\n
"
" ds_read_b64 v[146:147], v54 offset:22912
\n
"
" ds_read_b64 v[148:149], v54 offset:23808
\n
"
" ds_read_b64 v[150:151], v54 offset:23936
\n
"
" ds_read_b64 v[152:153], v54 offset:24832
\n
"
" ds_read_b64 v[154:155], v54 offset:24960
\n
"
" ds_read_b64 v[156:157], v54 offset:25856
\n
"
" ds_read_b64 v[158:159], v54 offset:25984
\n
"
" ds_read_b64 v[160:161], v54 offset:26880
\n
"
" ds_read_b64 v[162:163], v54 offset:27008
\n
"
" ds_read_b64 v[164:165], v54 offset:27904
\n
"
" ds_read_b64 v[166:167], v54 offset:28032
\n
"
" ds_read_b64 v[168:169], v54 offset:28928
\n
"
" ds_read_b64 v[170:171], v54 offset:29056
\n
"
" ds_read_b64 v[172:173], v54 offset:29952
\n
"
" ds_read_b64 v[174:175], v54 offset:30080
\n
"
" ds_read_b64 v[176:177], v54 offset:30976
\n
"
" ds_read_b64 v[178:179], v54 offset:31104
\n
"
" ds_read_b64 v[180:181], v54 offset:32000
\n
"
" ds_read_b64 v[182:183], v54 offset:32128
\n
"
" ds_read_b64 v[184:185], v54 offset:33024
\n
"
" ds_read_b64 v[186:187], v54 offset:33152
\n
"
" ds_read_b64 v[188:189], v54 offset:34048
\n
"
" ds_read_b64 v[190:191], v54 offset:34176
\n
"
#undef _UK_MFMA_
#undef _UK_PK_CVT_
#undef _UK_ATOMIC_ADD_
include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x256x512_1x4x1_16x16x32_int8_2.inc
View file @
6f7d1272
...
...
@@ -27,222 +27,190 @@
# define _UK_ATOMIC_ADD_ "global_atomic_pk_add_f16"
#endif
" ds_read_b64 v[128:129], v54 offset:18688
\n
"
" ds_read_b64 v[130:131], v54 offset:18816
\n
"
" ds_read_b64 v[132:133], v54 offset:19712
\n
"
" ds_read_b64 v[134:135], v54 offset:19840
\n
"
" ds_read_b64 v[136:137], v54 offset:20736
\n
"
" ds_read_b64 v[138:139], v54 offset:20864
\n
"
" ds_read_b64 v[140:141], v54 offset:21760
\n
"
" ds_read_b64 v[142:143], v54 offset:21888
\n
"
" ds_read_b64 v[144:145], v54 offset:22784
\n
"
" ds_read_b64 v[146:147], v54 offset:22912
\n
"
" ds_read_b64 v[148:149], v54 offset:23808
\n
"
" ds_read_b64 v[150:151], v54 offset:23936
\n
"
" ds_read_b64 v[152:153], v54 offset:24832
\n
"
" ds_read_b64 v[154:155], v54 offset:24960
\n
"
" ds_read_b64 v[156:157], v54 offset:25856
\n
"
" ds_read_b64 v[158:159], v54 offset:25984
\n
"
" ds_read_b64 v[160:161], v54 offset:26880
\n
"
" ds_read_b64 v[162:163], v54 offset:27008
\n
"
" ds_read_b64 v[164:165], v54 offset:27904
\n
"
" ds_read_b64 v[166:167], v54 offset:28032
\n
"
" ds_read_b64 v[168:169], v54 offset:28928
\n
"
" ds_read_b64 v[170:171], v54 offset:29056
\n
"
" ds_read_b64 v[172:173], v54 offset:29952
\n
"
" ds_read_b64 v[174:175], v54 offset:30080
\n
"
" ds_read_b64 v[176:177], v54 offset:30976
\n
"
" ds_read_b64 v[178:179], v54 offset:31104
\n
"
" ds_read_b64 v[180:181], v54 offset:32000
\n
"
" ds_read_b64 v[182:183], v54 offset:32128
\n
"
" ds_read_b64 v[184:185], v54 offset:33024
\n
"
" ds_read_b64 v[186:187], v54 offset:33152
\n
"
" ds_read_b64 v[188:189], v54 offset:34048
\n
"
" ds_read_b64 v[190:191], v54 offset:34176
\n
"
" s_add_u32 s12, %[s_tile_os_b], s12
\n
"
" s_addc_u32 s13, 0, s13
\n
"
" s_add_u32 s16, %[s_tile_os_dq], s16
\n
"
" s_addc_u32 s17, 0, s17
\n
"
" s_add_u32 %[s_res_d][0], %[s_tile_os_b], %[s_res_d][0]
\n
"
" s_addc_u32 %[s_res_d][1], 0, %[s_res_d][1]
\n
"
" s_add_u32 %[s_res_dq][0], %[s_tile_os_dq], %[s_res_dq][0]
\n
"
" s_addc_u32 %[s_res_dq][1], 0, %[s_res_dq][1]
\n
"
" s_mov_b32 s80, 0
\n
"
" s_waitcnt 0x0000
\n
"
"label_startgemm2:
\n
"
" s_waitcnt vmcnt(41)
\n
"
" s_barrier
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[0:1], v[128:129], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[2:3], v[130:131], v[192:195]
\n
"
" buffer_load_dwordx4 acc[128:131], %[v_os_b0], %[s_res_d], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[4:5], v[132:133], v[192:195]
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[6:7], v[134:135], v[192:195]
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[8:9], v[136:137], v[192:195]
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[10:11], v[138:139], v[192:195]
\n
"
" buffer_load_dwordx4 acc[132:135], %[v_os_b0], %[s_res_d], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[12:13], v[140:141], v[192:195]
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[14:15], v[142:143], v[192:195]
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[0:1], v[160:161], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[2:3], v[162:163], v[196:199]
\n
"
" buffer_load_dwordx4 acc[136:139], %[v_os_b0], %[s_res_d], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[4:5], v[164:165], v[196:199]
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[6:7], v[166:167], v[196:199]
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[8:9], v[168:169], v[196:199]
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[10:11], v[170:171], v[196:199]
\n
"
" buffer_load_dwordx4 acc[140:143], %[v_os_b0], %[s_res_d], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[12:13], v[172:173], v[196:199]
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[14:15], v[174:175], v[196:199]
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[16:17], v[128:129], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[18:19], v[130:131], v[200:203]
\n
"
" buffer_load_dwordx4 acc[144:147], %[v_os_b1], %[s_res_d], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[20:21], v[132:133], v[200:203]
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[22:23], v[134:135], v[200:203]
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[24:25], v[136:137], v[200:203]
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[26:27], v[138:139], v[200:203]
\n
"
" buffer_load_dwordx4 acc[148:151], %[v_os_b1], %[s_res_d], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[28:29], v[140:141], v[200:203]
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[30:31], v[142:143], v[200:203]
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[16:17], v[160:161], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[18:19], v[162:163], v[204:207]
\n
"
" buffer_load_dwordx4 acc[152:155], %[v_os_b1], %[s_res_d], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[20:21], v[164:165], v[204:207]
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[22:23], v[166:167], v[204:207]
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[24:25], v[168:169], v[204:207]
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[26:27], v[170:171], v[204:207]
\n
"
" buffer_load_dwordx4 acc[156:159], %[v_os_b1], %[s_res_d], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[28:29], v[172:173], v[204:207]
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[30:31], v[174:175], v[204:207]
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[0:1], v[128:129], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[2:3], v[130:131], v[192:195]
\n
"
" buffer_load_dwordx4 acc[128:131], %[v_os_b0], %[s_res_d], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[4:5], v[132:133], v[192:195]
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[6:7], v[134:135], v[192:195]
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[8:9], v[136:137], v[192:195]
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[10:11], v[138:139], v[192:195]
\n
"
" buffer_load_dwordx4 acc[132:135], %[v_os_b0], %[s_res_d], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[12:13], v[140:141], v[192:195]
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[14:15], v[142:143], v[192:195]
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[0:1], v[160:161], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[2:3], v[162:163], v[196:199]
\n
"
" buffer_load_dwordx4 acc[136:139], %[v_os_b0], %[s_res_d], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[4:5], v[164:165], v[196:199]
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[6:7], v[166:167], v[196:199]
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[8:9], v[168:169], v[196:199]
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[10:11], v[170:171], v[196:199]
\n
"
" buffer_load_dwordx4 acc[140:143], %[v_os_b0], %[s_res_d], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[12:13], v[172:173], v[196:199]
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[14:15], v[174:175], v[196:199]
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[16:17], v[128:129], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[18:19], v[130:131], v[200:203]
\n
"
" buffer_load_dwordx4 acc[144:147], %[v_os_b1], %[s_res_d], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[20:21], v[132:133], v[200:203]
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[22:23], v[134:135], v[200:203]
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[24:25], v[136:137], v[200:203]
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[26:27], v[138:139], v[200:203]
\n
"
" buffer_load_dwordx4 acc[148:151], %[v_os_b1], %[s_res_d], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[28:29], v[140:141], v[200:203]
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[30:31], v[142:143], v[200:203]
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[16:17], v[160:161], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[18:19], v[162:163], v[204:207]
\n
"
" buffer_load_dwordx4 acc[152:155], %[v_os_b1], %[s_res_d], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[20:21], v[164:165], v[204:207]
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[22:23], v[166:167], v[204:207]
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[24:25], v[168:169], v[204:207]
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[26:27], v[170:171], v[204:207]
\n
"
" buffer_load_dwordx4 acc[156:159], %[v_os_b1], %[s_res_d], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[28:29], v[172:173], v[204:207]
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[30:31], v[174:175], v[204:207]
\n
"
" s_waitcnt vmcnt(41)
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[32:33], v[128:129], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[34:35], v[130:131], v[208:211]
\n
"
" buffer_load_dwordx4 acc[160:163], %[v_os_b2], %[s_res_d], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[36:37], v[132:133], v[208:211]
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[38:39], v[134:135], v[208:211]
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[40:41], v[136:137], v[208:211]
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[42:43], v[138:139], v[208:211]
\n
"
" buffer_load_dwordx4 acc[164:167], %[v_os_b2], %[s_res_d], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[44:45], v[140:141], v[208:211]
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[46:47], v[142:143], v[208:211]
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[32:33], v[160:161], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[34:35], v[162:163], v[212:215]
\n
"
" buffer_load_dwordx4 acc[168:171], %[v_os_b2], %[s_res_d], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[36:37], v[164:165], v[212:215]
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[38:39], v[166:167], v[212:215]
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[40:41], v[168:169], v[212:215]
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[42:43], v[170:171], v[212:215]
\n
"
" buffer_load_dwordx4 acc[172:175], %[v_os_b2], %[s_res_d], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[44:45], v[172:173], v[212:215]
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[46:47], v[174:175], v[212:215]
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[48:49], v[128:129], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[50:51], v[130:131], v[216:219]
\n
"
" buffer_load_dwordx4 acc[176:179], %[v_os_b3], %[s_res_d], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[52:53], v[132:133], v[216:219]
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[54:55], v[134:135], v[216:219]
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[56:57], v[136:137], v[216:219]
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[58:59], v[138:139], v[216:219]
\n
"
" buffer_load_dwordx4 acc[180:183], %[v_os_b3], %[s_res_d], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[60:61], v[140:141], v[216:219]
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[62:63], v[142:143], v[216:219]
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[48:49], v[160:161], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[50:51], v[162:163], v[220:223]
\n
"
" buffer_load_dwordx4 acc[184:187], %[v_os_b3], %[s_res_d], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[52:53], v[164:165], v[220:223]
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[54:55], v[166:167], v[220:223]
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[56:57], v[168:169], v[220:223]
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[58:59], v[170:171], v[220:223]
\n
"
" buffer_load_dwordx4 acc[188:191], %[v_os_b3], %[s_res_d], 0 offen offset:3072
\n
"
" s_add_u32
s12
, %[s_tile_os_b_half],
s12
\n
"
" s_addc_u32
s13, 0, s13
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[60:61], v[172:173], v[220:223]
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[62:63], v[174:175], v[220:223]
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[32:33], v[128:129], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[34:35], v[130:131], v[208:211]
\n
"
" buffer_load_dwordx4 acc[160:163], %[v_os_b2], %[s_res_d], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[36:37], v[132:133], v[208:211]
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[38:39], v[134:135], v[208:211]
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[40:41], v[136:137], v[208:211]
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[42:43], v[138:139], v[208:211]
\n
"
" buffer_load_dwordx4 acc[164:167], %[v_os_b2], %[s_res_d], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[44:45], v[140:141], v[208:211]
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[46:47], v[142:143], v[208:211]
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[32:33], v[160:161], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[34:35], v[162:163], v[212:215]
\n
"
" buffer_load_dwordx4 acc[168:171], %[v_os_b2], %[s_res_d], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[36:37], v[164:165], v[212:215]
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[38:39], v[166:167], v[212:215]
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[40:41], v[168:169], v[212:215]
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[42:43], v[170:171], v[212:215]
\n
"
" buffer_load_dwordx4 acc[172:175], %[v_os_b2], %[s_res_d], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[44:45], v[172:173], v[212:215]
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[46:47], v[174:175], v[212:215]
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[48:49], v[128:129], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[50:51], v[130:131], v[216:219]
\n
"
" buffer_load_dwordx4 acc[176:179], %[v_os_b3], %[s_res_d], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[52:53], v[132:133], v[216:219]
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[54:55], v[134:135], v[216:219]
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[56:57], v[136:137], v[216:219]
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[58:59], v[138:139], v[216:219]
\n
"
" buffer_load_dwordx4 acc[180:183], %[v_os_b3], %[s_res_d], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[60:61], v[140:141], v[216:219]
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[62:63], v[142:143], v[216:219]
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[48:49], v[160:161], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[50:51], v[162:163], v[220:223]
\n
"
" buffer_load_dwordx4 acc[184:187], %[v_os_b3], %[s_res_d], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[52:53], v[164:165], v[220:223]
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[54:55], v[166:167], v[220:223]
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[56:57], v[168:169], v[220:223]
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[58:59], v[170:171], v[220:223]
\n
"
" buffer_load_dwordx4 acc[188:191], %[v_os_b3], %[s_res_d], 0 offen offset:3072
\n
"
" s_add_u32
%[s_res_d][0]
, %[s_tile_os_b_half],
%[s_res_d][0]
\n
"
" s_addc_u32
%[s_res_d][1], 0, %[s_res_d][1]
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[60:61], v[172:173], v[220:223]
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[62:63], v[174:175], v[220:223]
\n
"
" s_waitcnt vmcnt(41)
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[64:65], v[144:145], v[192:195]
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[66:67], v[146:147], v[192:195]
\n
"
" buffer_load_dwordx4 acc[192:195], %[v_os_b0], %[s_res_d], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[68:69], v[148:149], v[192:195]
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[70:71], v[150:151], v[192:195]
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[72:73], v[152:153], v[192:195]
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[74:75], v[154:155], v[192:195]
\n
"
" buffer_load_dwordx4 acc[196:199], %[v_os_b0], %[s_res_d], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[76:77], v[156:157], v[192:195]
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[78:79], v[158:159], v[192:195]
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[64:65], v[176:177], v[196:199]
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[66:67], v[178:179], v[196:199]
\n
"
" buffer_load_dwordx4 acc[200:203], %[v_os_b0], %[s_res_d], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[68:69], v[180:181], v[196:199]
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[70:71], v[182:183], v[196:199]
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[72:73], v[184:185], v[196:199]
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[74:75], v[186:187], v[196:199]
\n
"
" buffer_load_dwordx4 acc[204:207], %[v_os_b0], %[s_res_d], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[76:77], v[188:189], v[196:199]
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[78:79], v[190:191], v[196:199]
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[80:81], v[144:145], v[200:203]
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[82:83], v[146:147], v[200:203]
\n
"
" buffer_load_dwordx4 acc[208:211], %[v_os_b1], %[s_res_d], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[84:85], v[148:149], v[200:203]
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[86:87], v[150:151], v[200:203]
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[88:89], v[152:153], v[200:203]
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[90:91], v[154:155], v[200:203]
\n
"
" buffer_load_dwordx4 acc[212:215], %[v_os_b1], %[s_res_d], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[92:93], v[156:157], v[200:203]
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[94:95], v[158:159], v[200:203]
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[80:81], v[176:177], v[204:207]
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[82:83], v[178:179], v[204:207]
\n
"
" buffer_load_dwordx4 acc[216:219], %[v_os_b1], %[s_res_d], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[84:85], v[180:181], v[204:207]
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[86:87], v[182:183], v[204:207]
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[88:89], v[184:185], v[204:207]
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[90:91], v[186:187], v[204:207]
\n
"
" buffer_load_dwordx4 acc[220:223], %[v_os_b1], %[s_res_d], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[92:93], v[188:189], v[204:207]
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[94:95], v[190:191], v[204:207]
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[64:65], v[144:145], v[192:195]
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[66:67], v[146:147], v[192:195]
\n
"
" buffer_load_dwordx4 acc[192:195], %[v_os_b0], %[s_res_d], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[68:69], v[148:149], v[192:195]
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[70:71], v[150:151], v[192:195]
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[72:73], v[152:153], v[192:195]
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[74:75], v[154:155], v[192:195]
\n
"
" buffer_load_dwordx4 acc[196:199], %[v_os_b0], %[s_res_d], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[76:77], v[156:157], v[192:195]
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[78:79], v[158:159], v[192:195]
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[64:65], v[176:177], v[196:199]
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[66:67], v[178:179], v[196:199]
\n
"
" buffer_load_dwordx4 acc[200:203], %[v_os_b0], %[s_res_d], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[68:69], v[180:181], v[196:199]
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[70:71], v[182:183], v[196:199]
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[72:73], v[184:185], v[196:199]
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[74:75], v[186:187], v[196:199]
\n
"
" buffer_load_dwordx4 acc[204:207], %[v_os_b0], %[s_res_d], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[76:77], v[188:189], v[196:199]
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[78:79], v[190:191], v[196:199]
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[80:81], v[144:145], v[200:203]
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[82:83], v[146:147], v[200:203]
\n
"
" buffer_load_dwordx4 acc[208:211], %[v_os_b1], %[s_res_d], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[84:85], v[148:149], v[200:203]
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[86:87], v[150:151], v[200:203]
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[88:89], v[152:153], v[200:203]
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[90:91], v[154:155], v[200:203]
\n
"
" buffer_load_dwordx4 acc[212:215], %[v_os_b1], %[s_res_d], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[92:93], v[156:157], v[200:203]
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[94:95], v[158:159], v[200:203]
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[80:81], v[176:177], v[204:207]
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[82:83], v[178:179], v[204:207]
\n
"
" buffer_load_dwordx4 acc[216:219], %[v_os_b1], %[s_res_d], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[84:85], v[180:181], v[204:207]
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[86:87], v[182:183], v[204:207]
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[88:89], v[184:185], v[204:207]
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[90:91], v[186:187], v[204:207]
\n
"
" buffer_load_dwordx4 acc[220:223], %[v_os_b1], %[s_res_d], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[92:93], v[188:189], v[204:207]
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[94:95], v[190:191], v[204:207]
\n
"
" s_waitcnt vmcnt(40)
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[96:97], v[144:145], v[208:211]
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[98:99], v[146:147], v[208:211]
\n
"
" buffer_load_dwordx4 acc[224:227], %[v_os_b2], %[s_res_d], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[100:101], v[148:149], v[208:211]
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[102:103], v[150:151], v[208:211]
\n
"
" buffer_load_dword v13,
v5
, %[s_res_dq], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[104:105], v[152:153], v[208:211]
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[106:107], v[154:155], v[208:211]
\n
"
" buffer_load_dwordx4 acc[228:231], %[v_os_b2], %[s_res_d], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[108:109], v[156:157], v[208:211]
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[110:111], v[158:159], v[208:211]
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[96:97], v[176:177], v[212:215]
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[98:99], v[178:179], v[212:215]
\n
"
" buffer_load_dwordx4 acc[232:235], %[v_os_b2], %[s_res_d], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[100:101], v[180:181], v[212:215]
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[102:103], v[182:183], v[212:215]
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[104:105], v[184:185], v[212:215]
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[106:107], v[186:187], v[212:215]
\n
"
" buffer_load_dwordx4 acc[236:239], %[v_os_b2], %[s_res_d], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[108:109], v[188:189], v[212:215]
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[110:111], v[190:191], v[212:215]
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[112:113], v[144:145], v[216:219]
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[114:115], v[146:147], v[216:219]
\n
"
" buffer_load_dwordx4 acc[240:243], %[v_os_b3], %[s_res_d], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[116:117], v[148:149], v[216:219]
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[118:119], v[150:151], v[216:219]
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[120:121], v[152:153], v[216:219]
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[122:123], v[154:155], v[216:219]
\n
"
" buffer_load_dwordx4 acc[244:247], %[v_os_b3], %[s_res_d], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[124:125], v[156:157], v[216:219]
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[126:127], v[158:159], v[216:219]
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[112:113], v[176:177], v[220:223]
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[114:115], v[178:179], v[220:223]
\n
"
" buffer_load_dwordx4 acc[248:251], %[v_os_b3], %[s_res_d], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[116:117], v[180:181], v[220:223]
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[118:119], v[182:183], v[220:223]
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[120:121], v[184:185], v[220:223]
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[122:123], v[186:187], v[220:223]
\n
"
" buffer_load_dwordx4 acc[252:255], %[v_os_b3], %[s_res_d], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[124:125], v[188:189], v[220:223]
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[126:127], v[190:191], v[220:223]
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[96:97], v[144:145], v[208:211]
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[98:99], v[146:147], v[208:211]
\n
"
" buffer_load_dwordx4 acc[224:227], %[v_os_b2], %[s_res_d], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[100:101], v[148:149], v[208:211]
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[102:103], v[150:151], v[208:211]
\n
"
" buffer_load_dword v13,
%[v_os_dq]
, %[s_res_dq], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[104:105], v[152:153], v[208:211]
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[106:107], v[154:155], v[208:211]
\n
"
" buffer_load_dwordx4 acc[228:231], %[v_os_b2], %[s_res_d], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[108:109], v[156:157], v[208:211]
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[110:111], v[158:159], v[208:211]
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[96:97], v[176:177], v[212:215]
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[98:99], v[178:179], v[212:215]
\n
"
" buffer_load_dwordx4 acc[232:235], %[v_os_b2], %[s_res_d], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[100:101], v[180:181], v[212:215]
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[102:103], v[182:183], v[212:215]
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[104:105], v[184:185], v[212:215]
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[106:107], v[186:187], v[212:215]
\n
"
" buffer_load_dwordx4 acc[236:239], %[v_os_b2], %[s_res_d], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[108:109], v[188:189], v[212:215]
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[110:111], v[190:191], v[212:215]
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[112:113], v[144:145], v[216:219]
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[114:115], v[146:147], v[216:219]
\n
"
" buffer_load_dwordx4 acc[240:243], %[v_os_b3], %[s_res_d], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[116:117], v[148:149], v[216:219]
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[118:119], v[150:151], v[216:219]
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[120:121], v[152:153], v[216:219]
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[122:123], v[154:155], v[216:219]
\n
"
" buffer_load_dwordx4 acc[244:247], %[v_os_b3], %[s_res_d], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[124:125], v[156:157], v[216:219]
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[126:127], v[158:159], v[216:219]
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[112:113], v[176:177], v[220:223]
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[114:115], v[178:179], v[220:223]
\n
"
" buffer_load_dwordx4 acc[248:251], %[v_os_b3], %[s_res_d], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[116:117], v[180:181], v[220:223]
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[118:119], v[182:183], v[220:223]
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[120:121], v[184:185], v[220:223]
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[122:123], v[186:187], v[220:223]
\n
"
" buffer_load_dwordx4 acc[252:255], %[v_os_b3], %[s_res_d], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[124:125], v[188:189], v[220:223]
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[126:127], v[190:191], v[220:223]
\n
"
" s_add_u32 s60, 0x00000200, s80
\n
"
" s_cmp_lt_u32 s60, s81
\n
"
" s_cselect_b32 %[s_tile_os_b], %[s_tile_os_b], 0
\n
"
" s_cselect_b32 %[s_tile_os_b_half], %[s_tile_os_b_half], 0
\n
"
" s_cselect_b32 %[s_tile_os_dq], %[s_tile_os_dq], 0
\n
"
" s_add_u32
s12
, %[s_tile_os_b],
s12
\n
"
" s_addc_u32
s13, 0, s13
\n
"
" s_add_u32
s16
, %[s_tile_os_dq],
s16
\n
"
" s_addc_u32
s17, 0, s17
\n
"
" s_add_u32
%[s_res_d][0]
, %[s_tile_os_b],
%[s_res_d][0]
\n
"
" s_addc_u32
%[s_res_d][1], 0, %[s_res_d][1]
\n
"
" s_add_u32
%[s_res_dq][0]
, %[s_tile_os_dq],
%[s_res_dq][0]
\n
"
" s_addc_u32
%[s_res_dq][1], 0, %[s_res_dq][1]
\n
"
" v_cvt_f32_i32 v192, v192
\n
"
" v_cvt_f32_i32 v193, v193
\n
"
" v_cvt_f32_i32 v194, v194
\n
"
...
...
@@ -255,10 +223,10 @@
" v_mul_f32 v193, v12, v193 row_newbcast:1
\n
"
" v_mul_f32 v194, v12, v194 row_newbcast:2
\n
"
" v_mul_f32 v195, v12, v195 row_newbcast:3
\n
"
" v_mul_f32 v192,
v20
, v192
\n
"
" v_mul_f32 v193,
v20
, v193
\n
"
" v_mul_f32 v194,
v20
, v194
\n
"
" v_mul_f32 v195,
v20
, v195
\n
"
" v_mul_f32 v192,
%[scale_0]
, v192
\n
"
" v_mul_f32 v193,
%[scale_0]
, v193
\n
"
" v_mul_f32 v194,
%[scale_0]
, v194
\n
"
" v_mul_f32 v195,
%[scale_0]
, v195
\n
"
" v_cvt_f32_i32 v196, v196
\n
"
" v_cvt_f32_i32 v197, v197
\n
"
" v_cvt_f32_i32 v198, v198
\n
"
...
...
@@ -271,10 +239,10 @@
" v_mul_f32 v197, v12, v197 row_newbcast:1
\n
"
" v_mul_f32 v198, v12, v198 row_newbcast:2
\n
"
" v_mul_f32 v199, v12, v199 row_newbcast:3
\n
"
" v_mul_f32 v196,
v21
, v196
\n
"
" v_mul_f32 v197,
v21
, v197
\n
"
" v_mul_f32 v198,
v21
, v198
\n
"
" v_mul_f32 v199,
v21
, v199
\n
"
" v_mul_f32 v196,
%[scale_1]
, v196
\n
"
" v_mul_f32 v197,
%[scale_1]
, v197
\n
"
" v_mul_f32 v198,
%[scale_1]
, v198
\n
"
" v_mul_f32 v199,
%[scale_1]
, v199
\n
"
" v_cvt_f32_i32 v200, v200
\n
"
" v_cvt_f32_i32 v201, v201
\n
"
" v_cvt_f32_i32 v202, v202
\n
"
...
...
@@ -287,10 +255,10 @@
" v_mul_f32 v201, v12, v201 row_newbcast:5
\n
"
" v_mul_f32 v202, v12, v202 row_newbcast:6
\n
"
" v_mul_f32 v203, v12, v203 row_newbcast:7
\n
"
" v_mul_f32 v200,
v20
, v200
\n
"
" v_mul_f32 v201,
v20
, v201
\n
"
" v_mul_f32 v202,
v20
, v202
\n
"
" v_mul_f32 v203,
v20
, v203
\n
"
" v_mul_f32 v200,
%[scale_0]
, v200
\n
"
" v_mul_f32 v201,
%[scale_0]
, v201
\n
"
" v_mul_f32 v202,
%[scale_0]
, v202
\n
"
" v_mul_f32 v203,
%[scale_0]
, v203
\n
"
" v_cvt_f32_i32 v204, v204
\n
"
" v_cvt_f32_i32 v205, v205
\n
"
" v_cvt_f32_i32 v206, v206
\n
"
...
...
@@ -303,10 +271,10 @@
" v_mul_f32 v205, v12, v205 row_newbcast:5
\n
"
" v_mul_f32 v206, v12, v206 row_newbcast:6
\n
"
" v_mul_f32 v207, v12, v207 row_newbcast:7
\n
"
" v_mul_f32 v204,
v21
, v204
\n
"
" v_mul_f32 v205,
v21
, v205
\n
"
" v_mul_f32 v206,
v21
, v206
\n
"
" v_mul_f32 v207,
v21
, v207
\n
"
" v_mul_f32 v204,
%[scale_1]
, v204
\n
"
" v_mul_f32 v205,
%[scale_1]
, v205
\n
"
" v_mul_f32 v206,
%[scale_1]
, v206
\n
"
" v_mul_f32 v207,
%[scale_1]
, v207
\n
"
" v_cvt_f32_i32 v208, v208
\n
"
" v_cvt_f32_i32 v209, v209
\n
"
" v_cvt_f32_i32 v210, v210
\n
"
...
...
@@ -319,10 +287,10 @@
" v_mul_f32 v209, v12, v209 row_newbcast:9
\n
"
" v_mul_f32 v210, v12, v210 row_newbcast:10
\n
"
" v_mul_f32 v211, v12, v211 row_newbcast:11
\n
"
" v_mul_f32 v208,
v20
, v208
\n
"
" v_mul_f32 v209,
v20
, v209
\n
"
" v_mul_f32 v210,
v20
, v210
\n
"
" v_mul_f32 v211,
v20
, v211
\n
"
" v_mul_f32 v208,
%[scale_0]
, v208
\n
"
" v_mul_f32 v209,
%[scale_0]
, v209
\n
"
" v_mul_f32 v210,
%[scale_0]
, v210
\n
"
" v_mul_f32 v211,
%[scale_0]
, v211
\n
"
" v_cvt_f32_i32 v212, v212
\n
"
" v_cvt_f32_i32 v213, v213
\n
"
" v_cvt_f32_i32 v214, v214
\n
"
...
...
@@ -335,10 +303,10 @@
" v_mul_f32 v213, v12, v213 row_newbcast:9
\n
"
" v_mul_f32 v214, v12, v214 row_newbcast:10
\n
"
" v_mul_f32 v215, v12, v215 row_newbcast:11
\n
"
" v_mul_f32 v212,
v21
, v212
\n
"
" v_mul_f32 v213,
v21
, v213
\n
"
" v_mul_f32 v214,
v21
, v214
\n
"
" v_mul_f32 v215,
v21
, v215
\n
"
" v_mul_f32 v212,
%[scale_1]
, v212
\n
"
" v_mul_f32 v213,
%[scale_1]
, v213
\n
"
" v_mul_f32 v214,
%[scale_1]
, v214
\n
"
" v_mul_f32 v215,
%[scale_1]
, v215
\n
"
" v_cvt_f32_i32 v216, v216
\n
"
" v_cvt_f32_i32 v217, v217
\n
"
" v_cvt_f32_i32 v218, v218
\n
"
...
...
@@ -351,10 +319,10 @@
" v_mul_f32 v217, v12, v217 row_newbcast:13
\n
"
" v_mul_f32 v218, v12, v218 row_newbcast:14
\n
"
" v_mul_f32 v219, v12, v219 row_newbcast:15
\n
"
" v_mul_f32 v216,
v20
, v216
\n
"
" v_mul_f32 v217,
v20
, v217
\n
"
" v_mul_f32 v218,
v20
, v218
\n
"
" v_mul_f32 v219,
v20
, v219
\n
"
" v_mul_f32 v216,
%[scale_0]
, v216
\n
"
" v_mul_f32 v217,
%[scale_0]
, v217
\n
"
" v_mul_f32 v218,
%[scale_0]
, v218
\n
"
" v_mul_f32 v219,
%[scale_0]
, v219
\n
"
" v_cvt_f32_i32 v220, v220
\n
"
" v_cvt_f32_i32 v221, v221
\n
"
" v_cvt_f32_i32 v222, v222
\n
"
...
...
@@ -367,10 +335,10 @@
" v_mul_f32 v221, v12, v221 row_newbcast:13
\n
"
" v_mul_f32 v222, v12, v222 row_newbcast:14
\n
"
" v_mul_f32 v223, v12, v223 row_newbcast:15
\n
"
" v_mul_f32 v220,
v21
, v220
\n
"
" v_mul_f32 v221,
v21
, v221
\n
"
" v_mul_f32 v222,
v21
, v222
\n
"
" v_mul_f32 v223,
v21
, v223
\n
"
" v_mul_f32 v220,
%[scale_1]
, v220
\n
"
" v_mul_f32 v221,
%[scale_1]
, v221
\n
"
" v_mul_f32 v222,
%[scale_1]
, v222
\n
"
" v_mul_f32 v223,
%[scale_1]
, v223
\n
"
" v_cmp_u_f32 s[48:49], v192, v192
\n
"
" v_add3_u32 v50, v192, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
...
...
@@ -511,52 +479,52 @@
" ds_read_b32 v79, v4 offset:48224
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_mov_b64 exec, s[20:21]
\n
"
" global_atomic_pk_add_bf16
v80
, v64, [%[s_res_o0],%[s_res_o1]]
\n
"
" global_atomic_pk_add_bf16
%[v_os_o0]
, v64, [%[s_res_o0],%[s_res_o1]]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[20:21]
\n
"
" global_atomic_pk_add_bf16
v80
, v65, [%[s_res_o0],%[s_res_o1]] inst_offset:256
\n
"
" global_atomic_pk_add_bf16
%[v_os_o0]
, v65, [%[s_res_o0],%[s_res_o1]] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[22:23]
\n
"
" global_atomic_pk_add_bf16
v82
, v66, [%[s_res_o0],%[s_res_o1]]
\n
"
" global_atomic_pk_add_bf16
%[v_os_o1]
, v66, [%[s_res_o0],%[s_res_o1]]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[22:23]
\n
"
" global_atomic_pk_add_bf16
v82
, v67, [%[s_res_o0],%[s_res_o1]] inst_offset:256
\n
"
" global_atomic_pk_add_bf16
%[v_os_o1]
, v67, [%[s_res_o0],%[s_res_o1]] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[24:25]
\n
"
" global_atomic_pk_add_bf16
v84
, v68, [%[s_res_o0],%[s_res_o1]]
\n
"
" global_atomic_pk_add_bf16
%[v_os_o3]
, v68, [%[s_res_o0],%[s_res_o1]]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[24:25]
\n
"
" global_atomic_pk_add_bf16
v84
, v69, [%[s_res_o0],%[s_res_o1]] inst_offset:256
\n
"
" global_atomic_pk_add_bf16
%[v_os_o3]
, v69, [%[s_res_o0],%[s_res_o1]] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[26:27]
\n
"
" global_atomic_pk_add_bf16
v86
, v70, [%[s_res_o0],%[s_res_o1]]
\n
"
" global_atomic_pk_add_bf16
%[v_os_o4]
, v70, [%[s_res_o0],%[s_res_o1]]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[26:27]
\n
"
" global_atomic_pk_add_bf16
v86
, v71, [%[s_res_o0],%[s_res_o1]] inst_offset:256
\n
"
" global_atomic_pk_add_bf16
%[v_os_o4]
, v71, [%[s_res_o0],%[s_res_o1]] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[28:29]
\n
"
" global_atomic_pk_add_bf16
v88
, v72, [%[s_res_o0],%[s_res_o1]]
\n
"
" global_atomic_pk_add_bf16
%[v_os_o5]
, v72, [%[s_res_o0],%[s_res_o1]]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[28:29]
\n
"
" global_atomic_pk_add_bf16
v88
, v73, [%[s_res_o0],%[s_res_o1]] inst_offset:256
\n
"
" global_atomic_pk_add_bf16
%[v_os_o5]
, v73, [%[s_res_o0],%[s_res_o1]] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[30:31]
\n
"
" global_atomic_pk_add_bf16
v90
, v74, [%[s_res_o0],%[s_res_o1]]
\n
"
" global_atomic_pk_add_bf16
%[v_os_o6]
, v74, [%[s_res_o0],%[s_res_o1]]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[30:31]
\n
"
" global_atomic_pk_add_bf16
v90
, v75, [%[s_res_o0],%[s_res_o1]] inst_offset:256
\n
"
" global_atomic_pk_add_bf16
%[v_os_o6]
, v75, [%[s_res_o0],%[s_res_o1]] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[32:33]
\n
"
" global_atomic_pk_add_bf16
v92
, v76, [%[s_res_o0],%[s_res_o1]]
\n
"
" global_atomic_pk_add_bf16
%[v_os_o7]
, v76, [%[s_res_o0],%[s_res_o1]]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[32:33]
\n
"
" global_atomic_pk_add_bf16
v92
, v77, [%[s_res_o0],%[s_res_o1]] inst_offset:256
\n
"
" global_atomic_pk_add_bf16
%[v_os_o7]
, v77, [%[s_res_o0],%[s_res_o1]] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[34:35]
\n
"
" global_atomic_pk_add_bf16
v94
, v78, [%[s_res_o0],%[s_res_o1]]
\n
"
" global_atomic_pk_add_bf16
%[v_os_o8]
, v78, [%[s_res_o0],%[s_res_o1]]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[34:35]
\n
"
" global_atomic_pk_add_bf16
v94
, v79, [%[s_res_o0],%[s_res_o1]] inst_offset:256
\n
"
" global_atomic_pk_add_bf16
%[v_os_o8]
, v79, [%[s_res_o0],%[s_res_o1]] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_add_u32 %[s_res_o0], s59, %[s_res_o0]
\n
"
" s_addc_u32 %[s_res_o1], 0, %[s_res_o1]
\n
"
...
...
@@ -565,181 +533,181 @@
" s_cbranch_scc0 label_end_gemm2
\n
"
" s_waitcnt vmcnt(41)
\n
"
" s_barrier
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[128:129], v[128:129], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[130:131], v[130:131], v[224:227]
\n
"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], %[s_res_d], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[132:133], v[132:133], v[224:227]
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[134:135], v[134:135], v[224:227]
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[136:137], v[136:137], v[224:227]
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[138:139], v[138:139], v[224:227]
\n
"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], %[s_res_d], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[140:141], v[140:141], v[224:227]
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[142:143], v[142:143], v[224:227]
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[128:129], v[160:161], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[130:131], v[162:163], v[228:231]
\n
"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], %[s_res_d], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[132:133], v[164:165], v[228:231]
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[134:135], v[166:167], v[228:231]
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[136:137], v[168:169], v[228:231]
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[138:139], v[170:171], v[228:231]
\n
"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], %[s_res_d], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[140:141], v[172:173], v[228:231]
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[142:143], v[174:175], v[228:231]
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[144:145], v[128:129], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[146:147], v[130:131], v[232:235]
\n
"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], %[s_res_d], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[148:149], v[132:133], v[232:235]
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[150:151], v[134:135], v[232:235]
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[152:153], v[136:137], v[232:235]
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[154:155], v[138:139], v[232:235]
\n
"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], %[s_res_d], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[156:157], v[140:141], v[232:235]
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[158:159], v[142:143], v[232:235]
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[144:145], v[160:161], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[146:147], v[162:163], v[236:239]
\n
"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], %[s_res_d], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[148:149], v[164:165], v[236:239]
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[150:151], v[166:167], v[236:239]
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[152:153], v[168:169], v[236:239]
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[154:155], v[170:171], v[236:239]
\n
"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], %[s_res_d], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[156:157], v[172:173], v[236:239]
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[158:159], v[174:175], v[236:239]
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[128:129], v[128:129], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[130:131], v[130:131], v[224:227]
\n
"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], %[s_res_d], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[132:133], v[132:133], v[224:227]
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[134:135], v[134:135], v[224:227]
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[136:137], v[136:137], v[224:227]
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[138:139], v[138:139], v[224:227]
\n
"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], %[s_res_d], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[140:141], v[140:141], v[224:227]
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[142:143], v[142:143], v[224:227]
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[128:129], v[160:161], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[130:131], v[162:163], v[228:231]
\n
"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], %[s_res_d], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[132:133], v[164:165], v[228:231]
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[134:135], v[166:167], v[228:231]
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[136:137], v[168:169], v[228:231]
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[138:139], v[170:171], v[228:231]
\n
"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], %[s_res_d], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[140:141], v[172:173], v[228:231]
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[142:143], v[174:175], v[228:231]
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[144:145], v[128:129], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[146:147], v[130:131], v[232:235]
\n
"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], %[s_res_d], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[148:149], v[132:133], v[232:235]
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[150:151], v[134:135], v[232:235]
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[152:153], v[136:137], v[232:235]
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[154:155], v[138:139], v[232:235]
\n
"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], %[s_res_d], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[156:157], v[140:141], v[232:235]
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[158:159], v[142:143], v[232:235]
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[144:145], v[160:161], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[146:147], v[162:163], v[236:239]
\n
"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], %[s_res_d], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[148:149], v[164:165], v[236:239]
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[150:151], v[166:167], v[236:239]
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[152:153], v[168:169], v[236:239]
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[154:155], v[170:171], v[236:239]
\n
"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], %[s_res_d], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[156:157], v[172:173], v[236:239]
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[158:159], v[174:175], v[236:239]
\n
"
" s_waitcnt vmcnt(41)
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[160:161], v[128:129], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[162:163], v[130:131], v[240:243]
\n
"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], %[s_res_d], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[164:165], v[132:133], v[240:243]
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[166:167], v[134:135], v[240:243]
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[168:169], v[136:137], v[240:243]
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[170:171], v[138:139], v[240:243]
\n
"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], %[s_res_d], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[172:173], v[140:141], v[240:243]
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[174:175], v[142:143], v[240:243]
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[160:161], v[160:161], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[162:163], v[162:163], v[244:247]
\n
"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], %[s_res_d], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[164:165], v[164:165], v[244:247]
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[166:167], v[166:167], v[244:247]
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[168:169], v[168:169], v[244:247]
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[170:171], v[170:171], v[244:247]
\n
"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], %[s_res_d], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[172:173], v[172:173], v[244:247]
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[174:175], v[174:175], v[244:247]
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[176:177], v[128:129], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[178:179], v[130:131], v[248:251]
\n
"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], %[s_res_d], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[180:181], v[132:133], v[248:251]
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[182:183], v[134:135], v[248:251]
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[184:185], v[136:137], v[248:251]
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[186:187], v[138:139], v[248:251]
\n
"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], %[s_res_d], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[188:189], v[140:141], v[248:251]
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[190:191], v[142:143], v[248:251]
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[176:177], v[160:161], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[178:179], v[162:163], v[252:255]
\n
"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], %[s_res_d], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[180:181], v[164:165], v[252:255]
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[182:183], v[166:167], v[252:255]
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[184:185], v[168:169], v[252:255]
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[186:187], v[170:171], v[252:255]
\n
"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], %[s_res_d], 0 offen offset:3072
\n
"
" s_add_u32
s12
, %[s_tile_os_b_half],
s12
\n
"
" s_addc_u32
s13, 0, s13
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[188:189], v[172:173], v[252:255]
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[190:191], v[174:175], v[252:255]
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[160:161], v[128:129], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[162:163], v[130:131], v[240:243]
\n
"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], %[s_res_d], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[164:165], v[132:133], v[240:243]
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[166:167], v[134:135], v[240:243]
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[168:169], v[136:137], v[240:243]
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[170:171], v[138:139], v[240:243]
\n
"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], %[s_res_d], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[172:173], v[140:141], v[240:243]
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[174:175], v[142:143], v[240:243]
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[160:161], v[160:161], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[162:163], v[162:163], v[244:247]
\n
"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], %[s_res_d], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[164:165], v[164:165], v[244:247]
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[166:167], v[166:167], v[244:247]
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[168:169], v[168:169], v[244:247]
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[170:171], v[170:171], v[244:247]
\n
"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], %[s_res_d], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[172:173], v[172:173], v[244:247]
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[174:175], v[174:175], v[244:247]
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[176:177], v[128:129], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[178:179], v[130:131], v[248:251]
\n
"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], %[s_res_d], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[180:181], v[132:133], v[248:251]
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[182:183], v[134:135], v[248:251]
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[184:185], v[136:137], v[248:251]
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[186:187], v[138:139], v[248:251]
\n
"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], %[s_res_d], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[188:189], v[140:141], v[248:251]
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[190:191], v[142:143], v[248:251]
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[176:177], v[160:161], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[178:179], v[162:163], v[252:255]
\n
"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], %[s_res_d], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[180:181], v[164:165], v[252:255]
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[182:183], v[166:167], v[252:255]
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[184:185], v[168:169], v[252:255]
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[186:187], v[170:171], v[252:255]
\n
"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], %[s_res_d], 0 offen offset:3072
\n
"
" s_add_u32
%[s_res_d][0]
, %[s_tile_os_b_half],
%[s_res_d][0]
\n
"
" s_addc_u32
%[s_res_d][1], 0, %[s_res_d][1]
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[188:189], v[172:173], v[252:255]
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[190:191], v[174:175], v[252:255]
\n
"
" s_waitcnt vmcnt(41)
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[192:193], v[144:145], v[224:227]
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[194:195], v[146:147], v[224:227]
\n
"
" buffer_load_dwordx4 acc[64:67], %[v_os_b0], %[s_res_d], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[196:197], v[148:149], v[224:227]
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[198:199], v[150:151], v[224:227]
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[200:201], v[152:153], v[224:227]
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[202:203], v[154:155], v[224:227]
\n
"
" buffer_load_dwordx4 acc[68:71], %[v_os_b0], %[s_res_d], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[204:205], v[156:157], v[224:227]
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[206:207], v[158:159], v[224:227]
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[192:193], v[176:177], v[228:231]
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[194:195], v[178:179], v[228:231]
\n
"
" buffer_load_dwordx4 acc[72:75], %[v_os_b0], %[s_res_d], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[196:197], v[180:181], v[228:231]
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[198:199], v[182:183], v[228:231]
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[200:201], v[184:185], v[228:231]
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[202:203], v[186:187], v[228:231]
\n
"
" buffer_load_dwordx4 acc[76:79], %[v_os_b0], %[s_res_d], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[204:205], v[188:189], v[228:231]
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[206:207], v[190:191], v[228:231]
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[208:209], v[144:145], v[232:235]
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[210:211], v[146:147], v[232:235]
\n
"
" buffer_load_dwordx4 acc[80:83], %[v_os_b1], %[s_res_d], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[212:213], v[148:149], v[232:235]
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[214:215], v[150:151], v[232:235]
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[216:217], v[152:153], v[232:235]
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[218:219], v[154:155], v[232:235]
\n
"
" buffer_load_dwordx4 acc[84:87], %[v_os_b1], %[s_res_d], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[220:221], v[156:157], v[232:235]
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[222:223], v[158:159], v[232:235]
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[208:209], v[176:177], v[236:239]
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[210:211], v[178:179], v[236:239]
\n
"
" buffer_load_dwordx4 acc[88:91], %[v_os_b1], %[s_res_d], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[212:213], v[180:181], v[236:239]
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[214:215], v[182:183], v[236:239]
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[216:217], v[184:185], v[236:239]
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[218:219], v[186:187], v[236:239]
\n
"
" buffer_load_dwordx4 acc[92:95], %[v_os_b1], %[s_res_d], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[220:221], v[188:189], v[236:239]
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[222:223], v[190:191], v[236:239]
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[192:193], v[144:145], v[224:227]
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[194:195], v[146:147], v[224:227]
\n
"
" buffer_load_dwordx4 acc[64:67], %[v_os_b0], %[s_res_d], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[196:197], v[148:149], v[224:227]
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[198:199], v[150:151], v[224:227]
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[200:201], v[152:153], v[224:227]
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[202:203], v[154:155], v[224:227]
\n
"
" buffer_load_dwordx4 acc[68:71], %[v_os_b0], %[s_res_d], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[204:205], v[156:157], v[224:227]
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[206:207], v[158:159], v[224:227]
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[192:193], v[176:177], v[228:231]
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[194:195], v[178:179], v[228:231]
\n
"
" buffer_load_dwordx4 acc[72:75], %[v_os_b0], %[s_res_d], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[196:197], v[180:181], v[228:231]
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[198:199], v[182:183], v[228:231]
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[200:201], v[184:185], v[228:231]
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[202:203], v[186:187], v[228:231]
\n
"
" buffer_load_dwordx4 acc[76:79], %[v_os_b0], %[s_res_d], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[204:205], v[188:189], v[228:231]
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[206:207], v[190:191], v[228:231]
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[208:209], v[144:145], v[232:235]
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[210:211], v[146:147], v[232:235]
\n
"
" buffer_load_dwordx4 acc[80:83], %[v_os_b1], %[s_res_d], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[212:213], v[148:149], v[232:235]
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[214:215], v[150:151], v[232:235]
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[216:217], v[152:153], v[232:235]
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[218:219], v[154:155], v[232:235]
\n
"
" buffer_load_dwordx4 acc[84:87], %[v_os_b1], %[s_res_d], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[220:221], v[156:157], v[232:235]
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[222:223], v[158:159], v[232:235]
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[208:209], v[176:177], v[236:239]
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[210:211], v[178:179], v[236:239]
\n
"
" buffer_load_dwordx4 acc[88:91], %[v_os_b1], %[s_res_d], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[212:213], v[180:181], v[236:239]
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[214:215], v[182:183], v[236:239]
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[216:217], v[184:185], v[236:239]
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[218:219], v[186:187], v[236:239]
\n
"
" buffer_load_dwordx4 acc[92:95], %[v_os_b1], %[s_res_d], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[220:221], v[188:189], v[236:239]
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[222:223], v[190:191], v[236:239]
\n
"
" s_waitcnt vmcnt(40)
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[224:225], v[144:145], v[240:243]
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[226:227], v[146:147], v[240:243]
\n
"
" buffer_load_dwordx4 acc[96:99], %[v_os_b2], %[s_res_d], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[228:229], v[148:149], v[240:243]
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[230:231], v[150:151], v[240:243]
\n
"
" buffer_load_dword v12,
v5
, %[s_res_dq], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[232:233], v[152:153], v[240:243]
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[234:235], v[154:155], v[240:243]
\n
"
" buffer_load_dwordx4 acc[100:103], %[v_os_b2], %[s_res_d], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[236:237], v[156:157], v[240:243]
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[238:239], v[158:159], v[240:243]
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[224:225], v[176:177], v[244:247]
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[226:227], v[178:179], v[244:247]
\n
"
" buffer_load_dwordx4 acc[104:107], %[v_os_b2], %[s_res_d], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[228:229], v[180:181], v[244:247]
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[230:231], v[182:183], v[244:247]
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[232:233], v[184:185], v[244:247]
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[234:235], v[186:187], v[244:247]
\n
"
" buffer_load_dwordx4 acc[108:111], %[v_os_b2], %[s_res_d], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[236:237], v[188:189], v[244:247]
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[238:239], v[190:191], v[244:247]
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[240:241], v[144:145], v[248:251]
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[242:243], v[146:147], v[248:251]
\n
"
" buffer_load_dwordx4 acc[112:115], %[v_os_b3], %[s_res_d], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[244:245], v[148:149], v[248:251]
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[246:247], v[150:151], v[248:251]
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[248:249], v[152:153], v[248:251]
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[250:251], v[154:155], v[248:251]
\n
"
" buffer_load_dwordx4 acc[116:119], %[v_os_b3], %[s_res_d], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[252:253], v[156:157], v[248:251]
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[254:255], v[158:159], v[248:251]
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[240:241], v[176:177], v[252:255]
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[242:243], v[178:179], v[252:255]
\n
"
" buffer_load_dwordx4 acc[120:123], %[v_os_b3], %[s_res_d], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[244:245], v[180:181], v[252:255]
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[246:247], v[182:183], v[252:255]
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[248:249], v[184:185], v[252:255]
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[250:251], v[186:187], v[252:255]
\n
"
" buffer_load_dwordx4 acc[124:127], %[v_os_b3], %[s_res_d], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[252:253], v[188:189], v[252:255]
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[254:255], v[190:191], v[252:255]
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[224:225], v[144:145], v[240:243]
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[226:227], v[146:147], v[240:243]
\n
"
" buffer_load_dwordx4 acc[96:99], %[v_os_b2], %[s_res_d], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[228:229], v[148:149], v[240:243]
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[230:231], v[150:151], v[240:243]
\n
"
" buffer_load_dword v12,
%[v_os_dq]
, %[s_res_dq], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[232:233], v[152:153], v[240:243]
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[234:235], v[154:155], v[240:243]
\n
"
" buffer_load_dwordx4 acc[100:103], %[v_os_b2], %[s_res_d], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[236:237], v[156:157], v[240:243]
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[238:239], v[158:159], v[240:243]
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[224:225], v[176:177], v[244:247]
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[226:227], v[178:179], v[244:247]
\n
"
" buffer_load_dwordx4 acc[104:107], %[v_os_b2], %[s_res_d], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[228:229], v[180:181], v[244:247]
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[230:231], v[182:183], v[244:247]
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[232:233], v[184:185], v[244:247]
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[234:235], v[186:187], v[244:247]
\n
"
" buffer_load_dwordx4 acc[108:111], %[v_os_b2], %[s_res_d], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[236:237], v[188:189], v[244:247]
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[238:239], v[190:191], v[244:247]
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[240:241], v[144:145], v[248:251]
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[242:243], v[146:147], v[248:251]
\n
"
" buffer_load_dwordx4 acc[112:115], %[v_os_b3], %[s_res_d], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[244:245], v[148:149], v[248:251]
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[246:247], v[150:151], v[248:251]
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[248:249], v[152:153], v[248:251]
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[250:251], v[154:155], v[248:251]
\n
"
" buffer_load_dwordx4 acc[116:119], %[v_os_b3], %[s_res_d], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[252:253], v[156:157], v[248:251]
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[254:255], v[158:159], v[248:251]
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[240:241], v[176:177], v[252:255]
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[242:243], v[178:179], v[252:255]
\n
"
" buffer_load_dwordx4 acc[120:123], %[v_os_b3], %[s_res_d], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[244:245], v[180:181], v[252:255]
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[246:247], v[182:183], v[252:255]
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[248:249], v[184:185], v[252:255]
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[250:251], v[186:187], v[252:255]
\n
"
" buffer_load_dwordx4 acc[124:127], %[v_os_b3], %[s_res_d], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[252:253], v[188:189], v[252:255]
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[254:255], v[190:191], v[252:255]
\n
"
" s_add_u32 s60, 0x00000200, s80
\n
"
" s_cmp_lt_u32 s60, s81
\n
"
" s_cselect_b32 %[s_tile_os_b], %[s_tile_os_b], 0
\n
"
" s_cselect_b32 %[s_tile_os_b_half], %[s_tile_os_b_half], 0
\n
"
" s_cselect_b32 %[s_tile_os_dq], %[s_tile_os_dq], 0
\n
"
" s_add_u32
s12
, %[s_tile_os_b],
s12
\n
"
" s_addc_u32
s13, 0, s13
\n
"
" s_add_u32
s16
, %[s_tile_os_dq],
s16
\n
"
" s_addc_u32
s17, 0, s17
\n
"
" s_add_u32
%[s_res_d][0]
, %[s_tile_os_b],
%[s_res_d][0]
\n
"
" s_addc_u32
%[s_res_d][1], 0, %[s_res_d][1]
\n
"
" s_add_u32
%[s_res_dq][0]
, %[s_tile_os_dq],
%[s_res_dq][0]
\n
"
" s_addc_u32
%[s_res_dq][1], 0, %[s_res_dq][1]
\n
"
" v_cvt_f32_i32 v224, v224
\n
"
" v_cvt_f32_i32 v225, v225
\n
"
" v_cvt_f32_i32 v226, v226
\n
"
...
...
@@ -752,10 +720,10 @@
" v_mul_f32 v225, v13, v225 row_newbcast:1
\n
"
" v_mul_f32 v226, v13, v226 row_newbcast:2
\n
"
" v_mul_f32 v227, v13, v227 row_newbcast:3
\n
"
" v_mul_f32 v224,
v20
, v224
\n
"
" v_mul_f32 v225,
v20
, v225
\n
"
" v_mul_f32 v226,
v20
, v226
\n
"
" v_mul_f32 v227,
v20
, v227
\n
"
" v_mul_f32 v224,
%[scale_0]
, v224
\n
"
" v_mul_f32 v225,
%[scale_0]
, v225
\n
"
" v_mul_f32 v226,
%[scale_0]
, v226
\n
"
" v_mul_f32 v227,
%[scale_0]
, v227
\n
"
" v_cvt_f32_i32 v228, v228
\n
"
" v_cvt_f32_i32 v229, v229
\n
"
" v_cvt_f32_i32 v230, v230
\n
"
...
...
@@ -768,10 +736,10 @@
" v_mul_f32 v229, v13, v229 row_newbcast:1
\n
"
" v_mul_f32 v230, v13, v230 row_newbcast:2
\n
"
" v_mul_f32 v231, v13, v231 row_newbcast:3
\n
"
" v_mul_f32 v228,
v21
, v228
\n
"
" v_mul_f32 v229,
v21
, v229
\n
"
" v_mul_f32 v230,
v21
, v230
\n
"
" v_mul_f32 v231,
v21
, v231
\n
"
" v_mul_f32 v228,
%[scale_1]
, v228
\n
"
" v_mul_f32 v229,
%[scale_1]
, v229
\n
"
" v_mul_f32 v230,
%[scale_1]
, v230
\n
"
" v_mul_f32 v231,
%[scale_1]
, v231
\n
"
" v_cvt_f32_i32 v232, v232
\n
"
" v_cvt_f32_i32 v233, v233
\n
"
" v_cvt_f32_i32 v234, v234
\n
"
...
...
@@ -784,10 +752,10 @@
" v_mul_f32 v233, v13, v233 row_newbcast:5
\n
"
" v_mul_f32 v234, v13, v234 row_newbcast:6
\n
"
" v_mul_f32 v235, v13, v235 row_newbcast:7
\n
"
" v_mul_f32 v232,
v20
, v232
\n
"
" v_mul_f32 v233,
v20
, v233
\n
"
" v_mul_f32 v234,
v20
, v234
\n
"
" v_mul_f32 v235,
v20
, v235
\n
"
" v_mul_f32 v232,
%[scale_0]
, v232
\n
"
" v_mul_f32 v233,
%[scale_0]
, v233
\n
"
" v_mul_f32 v234,
%[scale_0]
, v234
\n
"
" v_mul_f32 v235,
%[scale_0]
, v235
\n
"
" v_cvt_f32_i32 v236, v236
\n
"
" v_cvt_f32_i32 v237, v237
\n
"
" v_cvt_f32_i32 v238, v238
\n
"
...
...
@@ -800,10 +768,10 @@
" v_mul_f32 v237, v13, v237 row_newbcast:5
\n
"
" v_mul_f32 v238, v13, v238 row_newbcast:6
\n
"
" v_mul_f32 v239, v13, v239 row_newbcast:7
\n
"
" v_mul_f32 v236,
v21
, v236
\n
"
" v_mul_f32 v237,
v21
, v237
\n
"
" v_mul_f32 v238,
v21
, v238
\n
"
" v_mul_f32 v239,
v21
, v239
\n
"
" v_mul_f32 v236,
%[scale_1]
, v236
\n
"
" v_mul_f32 v237,
%[scale_1]
, v237
\n
"
" v_mul_f32 v238,
%[scale_1]
, v238
\n
"
" v_mul_f32 v239,
%[scale_1]
, v239
\n
"
" v_cvt_f32_i32 v240, v240
\n
"
" v_cvt_f32_i32 v241, v241
\n
"
" v_cvt_f32_i32 v242, v242
\n
"
...
...
@@ -816,10 +784,10 @@
" v_mul_f32 v241, v13, v241 row_newbcast:9
\n
"
" v_mul_f32 v242, v13, v242 row_newbcast:10
\n
"
" v_mul_f32 v243, v13, v243 row_newbcast:11
\n
"
" v_mul_f32 v240,
v20
, v240
\n
"
" v_mul_f32 v241,
v20
, v241
\n
"
" v_mul_f32 v242,
v20
, v242
\n
"
" v_mul_f32 v243,
v20
, v243
\n
"
" v_mul_f32 v240,
%[scale_0]
, v240
\n
"
" v_mul_f32 v241,
%[scale_0]
, v241
\n
"
" v_mul_f32 v242,
%[scale_0]
, v242
\n
"
" v_mul_f32 v243,
%[scale_0]
, v243
\n
"
" v_cvt_f32_i32 v244, v244
\n
"
" v_cvt_f32_i32 v245, v245
\n
"
" v_cvt_f32_i32 v246, v246
\n
"
...
...
@@ -832,10 +800,10 @@
" v_mul_f32 v245, v13, v245 row_newbcast:9
\n
"
" v_mul_f32 v246, v13, v246 row_newbcast:10
\n
"
" v_mul_f32 v247, v13, v247 row_newbcast:11
\n
"
" v_mul_f32 v244,
v21
, v244
\n
"
" v_mul_f32 v245,
v21
, v245
\n
"
" v_mul_f32 v246,
v21
, v246
\n
"
" v_mul_f32 v247,
v21
, v247
\n
"
" v_mul_f32 v244,
%[scale_1]
, v244
\n
"
" v_mul_f32 v245,
%[scale_1]
, v245
\n
"
" v_mul_f32 v246,
%[scale_1]
, v246
\n
"
" v_mul_f32 v247,
%[scale_1]
, v247
\n
"
" v_cvt_f32_i32 v248, v248
\n
"
" v_cvt_f32_i32 v249, v249
\n
"
" v_cvt_f32_i32 v250, v250
\n
"
...
...
@@ -848,10 +816,10 @@
" v_mul_f32 v249, v13, v249 row_newbcast:13
\n
"
" v_mul_f32 v250, v13, v250 row_newbcast:14
\n
"
" v_mul_f32 v251, v13, v251 row_newbcast:15
\n
"
" v_mul_f32 v248,
v20
, v248
\n
"
" v_mul_f32 v249,
v20
, v249
\n
"
" v_mul_f32 v250,
v20
, v250
\n
"
" v_mul_f32 v251,
v20
, v251
\n
"
" v_mul_f32 v248,
%[scale_0]
, v248
\n
"
" v_mul_f32 v249,
%[scale_0]
, v249
\n
"
" v_mul_f32 v250,
%[scale_0]
, v250
\n
"
" v_mul_f32 v251,
%[scale_0]
, v251
\n
"
" v_cvt_f32_i32 v252, v252
\n
"
" v_cvt_f32_i32 v253, v253
\n
"
" v_cvt_f32_i32 v254, v254
\n
"
...
...
@@ -864,10 +832,10 @@
" v_mul_f32 v253, v13, v253 row_newbcast:13
\n
"
" v_mul_f32 v254, v13, v254 row_newbcast:14
\n
"
" v_mul_f32 v255, v13, v255 row_newbcast:15
\n
"
" v_mul_f32 v252,
v21
, v252
\n
"
" v_mul_f32 v253,
v21
, v253
\n
"
" v_mul_f32 v254,
v21
, v254
\n
"
" v_mul_f32 v255,
v21
, v255
\n
"
" v_mul_f32 v252,
%[scale_1]
, v252
\n
"
" v_mul_f32 v253,
%[scale_1]
, v253
\n
"
" v_mul_f32 v254,
%[scale_1]
, v254
\n
"
" v_mul_f32 v255,
%[scale_1]
, v255
\n
"
" v_cmp_u_f32 s[48:49], v224, v224
\n
"
" v_add3_u32 v50, v224, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
...
...
@@ -1008,52 +976,52 @@
" ds_read_b32 v79, v4 offset:48224
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_mov_b64 exec, s[20:21]
\n
"
" global_atomic_pk_add_bf16
v80
, v64, [%[s_res_o0],%[s_res_o1]]
\n
"
" global_atomic_pk_add_bf16
%[v_os_o0]
, v64, [%[s_res_o0],%[s_res_o1]]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[20:21]
\n
"
" global_atomic_pk_add_bf16
v80
, v65, [%[s_res_o0],%[s_res_o1]] inst_offset:256
\n
"
" global_atomic_pk_add_bf16
%[v_os_o0]
, v65, [%[s_res_o0],%[s_res_o1]] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[22:23]
\n
"
" global_atomic_pk_add_bf16
v82
, v66, [%[s_res_o0],%[s_res_o1]]
\n
"
" global_atomic_pk_add_bf16
%[v_os_o1]
, v66, [%[s_res_o0],%[s_res_o1]]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[22:23]
\n
"
" global_atomic_pk_add_bf16
v82
, v67, [%[s_res_o0],%[s_res_o1]] inst_offset:256
\n
"
" global_atomic_pk_add_bf16
%[v_os_o1]
, v67, [%[s_res_o0],%[s_res_o1]] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[24:25]
\n
"
" global_atomic_pk_add_bf16
v84
, v68, [%[s_res_o0],%[s_res_o1]]
\n
"
" global_atomic_pk_add_bf16
%[v_os_o3]
, v68, [%[s_res_o0],%[s_res_o1]]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[24:25]
\n
"
" global_atomic_pk_add_bf16
v84
, v69, [%[s_res_o0],%[s_res_o1]] inst_offset:256
\n
"
" global_atomic_pk_add_bf16
%[v_os_o3]
, v69, [%[s_res_o0],%[s_res_o1]] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[26:27]
\n
"
" global_atomic_pk_add_bf16
v86
, v70, [%[s_res_o0],%[s_res_o1]]
\n
"
" global_atomic_pk_add_bf16
%[v_os_o4]
, v70, [%[s_res_o0],%[s_res_o1]]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[26:27]
\n
"
" global_atomic_pk_add_bf16
v86
, v71, [%[s_res_o0],%[s_res_o1]] inst_offset:256
\n
"
" global_atomic_pk_add_bf16
%[v_os_o4]
, v71, [%[s_res_o0],%[s_res_o1]] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[28:29]
\n
"
" global_atomic_pk_add_bf16
v88
, v72, [%[s_res_o0],%[s_res_o1]]
\n
"
" global_atomic_pk_add_bf16
%[v_os_o5]
, v72, [%[s_res_o0],%[s_res_o1]]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[28:29]
\n
"
" global_atomic_pk_add_bf16
v88
, v73, [%[s_res_o0],%[s_res_o1]] inst_offset:256
\n
"
" global_atomic_pk_add_bf16
%[v_os_o5]
, v73, [%[s_res_o0],%[s_res_o1]] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[30:31]
\n
"
" global_atomic_pk_add_bf16
v90
, v74, [%[s_res_o0],%[s_res_o1]]
\n
"
" global_atomic_pk_add_bf16
%[v_os_o6]
, v74, [%[s_res_o0],%[s_res_o1]]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[30:31]
\n
"
" global_atomic_pk_add_bf16
v90
, v75, [%[s_res_o0],%[s_res_o1]] inst_offset:256
\n
"
" global_atomic_pk_add_bf16
%[v_os_o6]
, v75, [%[s_res_o0],%[s_res_o1]] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[32:33]
\n
"
" global_atomic_pk_add_bf16
v92
, v76, [%[s_res_o0],%[s_res_o1]]
\n
"
" global_atomic_pk_add_bf16
%[v_os_o7]
, v76, [%[s_res_o0],%[s_res_o1]]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[32:33]
\n
"
" global_atomic_pk_add_bf16
v92
, v77, [%[s_res_o0],%[s_res_o1]] inst_offset:256
\n
"
" global_atomic_pk_add_bf16
%[v_os_o7]
, v77, [%[s_res_o0],%[s_res_o1]] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[34:35]
\n
"
" global_atomic_pk_add_bf16
v94
, v78, [%[s_res_o0],%[s_res_o1]]
\n
"
" global_atomic_pk_add_bf16
%[v_os_o8]
, v78, [%[s_res_o0],%[s_res_o1]]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[34:35]
\n
"
" global_atomic_pk_add_bf16
v94
, v79, [%[s_res_o0],%[s_res_o1]] inst_offset:256
\n
"
" global_atomic_pk_add_bf16
%[v_os_o8]
, v79, [%[s_res_o0],%[s_res_o1]] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_add_u32 %[s_res_o0], s59, %[s_res_o0]
\n
"
" s_addc_u32 %[s_res_o1], 0, %[s_res_o1]
\n
"
...
...
@@ -1068,3 +1036,5 @@
#undef _UK_PK_CVT_
#undef _UK_ATOMIC_ADD_
include/ck_tile/ops/flatmm/block/uk/flatmm_uk_gfx9_32x512x256_1x1x1_16x16x32_int8.inc
View file @
6f7d1272
...
...
@@ -5,36 +5,20 @@
#if CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_INT8
# define _UK_MFMA_ "v_mfma_i32_16x16x32_i8"
#endif
# define _DEQUAN_CVT_(a0,a1,a2,a3,
b, c
) \
" v_cvt_f32_i32
a0, a0
\n
"
\
" v_cvt_f32_i32
a1, a1
\n
"
\
" v_cvt_f32_i32
a2, a2
\n
"
\
" v_cvt_f32_i32 a3
,
a3
\n
"
\
" v_mul_f32
a0, v15, a0
\n
"
\
" v_mul_f32
a1, v15, a1
\n
"
\
" v_mul_f32
a2, v15,
a2
\n
"
\
" v_mul_f32
a3, v15,
a3
\n
"
\
" v_mul_f32
a0, v17, a0
row_newbcast:
12
\n
"
\
" v_mul_f32
a1, v17,
a1 row_newbcast:1
3
\n
"
\
" v_mul_f32
a2, v17, a2
row_newbcast:
14
\n
"
\
" v_mul_f32
a3, v17, a3
row_newbcast:
15
\n
"
\
# define _DEQUAN_CVT_(a0,a1,a2,a3,
xq, gq,brd0,brd1,brd2,brd3
) \
" v_cvt_f32_i32
"
a0
", "
a0
"
\n
"
\
" v_cvt_f32_i32
"
a1
", "
a1
"
\n
"
\
" v_cvt_f32_i32
"
a2
", "
a2
"
\n
"
\
" v_cvt_f32_i32
"
a3
", "
a3
"
\n
"
\
" v_mul_f32
"
a0
", "
xq
", "
a0
"
\n
"
\
" v_mul_f32
"
a1
", "
xq
", "
a1
"
\n
"
\
" v_mul_f32
"
a2
", "
xq
", "
a2
"
\n
"
\
" v_mul_f32
"
a3
", "
xq
", "
a3
"
\n
"
\
" v_mul_f32
"
a0
", "
gq
", "
a0
"
row_newbcast:
"
brd0
"
\n
"
\
" v_mul_f32
"
a1
", "
gq
", "
a1
"
row_newbcast:
"
brd
1
"
\n
"
\
" v_mul_f32
"
a2
", "
gq
", "
a2
"
row_newbcast:
"
brd2
"
\n
"
\
" v_mul_f32
"
a3
", "
gq
", "
a3
"
row_newbcast:
"
brd3
"
\n
"
";----------------------------------------------
\n
"
" v_lshrrev_b32 v54, 4, v0
\n
"
" v_lshlrev_b32 v55, 2, v54
\n
"
" v_and_b32 v54, 15, v0
\n
"
" v_lshrrev_b32 v56, 2, v54
\n
"
" v_lshlrev_b32 v56, 6, v56
\n
"
" v_add_u32 v55, v56, v55
\n
"
" v_and_b32 v54, 3, v0
\n
"
" v_add_u32 v55, v54, v55
\n
"
" v_lshlrev_b32 v10, 2, v55
\n
"
" v_add_u32 v11, 0x00000400, v10
\n
"
" s_mul_i32 s60, %[s_wave_id], 16
\n
"
" s_mul_i32 s60, s60, 4
\n
"
" v_add_u32 v10, s60, v10
\n
"
" v_add_u32 v11, s60, v11
\n
"
" v_mov_b32 v5, v10
\n
"
";----------------------------------------------
\n
"
" s_mov_b32 s57, 0x00000100
\n
"
" s_mov_b32 s58, 0x00001000
\n
"
...
...
@@ -53,27 +37,22 @@
" v_mov_b32 v52, 0x7fff0000
\n
"
" v_mov_b32 v53, 0x00007fff
\n
"
" s_waitcnt 0x0000
\n
"
";----------------------------------------------
\n
"
" v_lshrrev_b32 v54, 24, %[v_token_id0]
\n
"
" v_mul_i32_i24 v54, s66, v54
\n
"
" v_and_b32 v55, 0x00ffffff, %[v_token_id0]
\n
"
" v_add_u32 %[v_token_id0], v54, v55
\n
"
" v_lshrrev_b32 v54, 24, %[v_token_id1]
\n
"
" v_mul_i32_i24 v54, s66, v54
\n
"
" v_and_b32 v55, 0x00ffffff, %[v_token_id1]
\n
"
" v_add_u32 %[v_token_id1], v54, v55
\n
"
" v_lshlrev_b32 %[v_token_id0], 2, %[v_token_id0]
\n
"
" v_lshlrev_b32 %[v_token_id1], 2, %[v_token_id1]
\n
"
" buffer_load_dword v14, %[v_token_id0], %[s_res_aq], 0 offen
\n
"
" buffer_load_dword v15, %[v_token_id1], %[s_res_aq], 0 offen
\n
"
" buffer_load_dword v16, v10, %[s_res_gq], 0 offen
\n
"
" buffer_load_dword v17, v11, %[s_res_gq], 0 offen
\n
"
" buffer_load_dword v18, v10, %[s_res_smq], 0 offen
\n
"
" buffer_load_dword v19, v11, %[s_res_smq], 0 offen
\n
"
" buffer_load_dword v20, v8, s[40:43], 0 offen
\n
"
" buffer_load_dword v21, v9, s[40:43], 0 offen
\n
"
" s_mov_b32 s80, 0
\n
"
" v_lshrrev_b32 v54, 4, v0
\n
"
" v_mul_i32_i24 v3, 34, v54
\n
"
" v_and_b32 v54, 15, v0
\n
"
" v_mul_i32_i24 v55, 2, v54
\n
"
" v_add_u32 v3, v55, v3
\n
"
" s_mul_i32 s60, s7, 0x00000088
\n
"
" v_add_u32 v3, s60, v3
\n
"
" v_lshlrev_b32 v3, 2, v3
\n
"
" v_lshrrev_b32 v54, 1, v0
\n
"
" v_mul_i32_i24 v4, 34, v54
\n
"
" v_and_b32 v55, 1, v0
\n
"
" v_add_u32 v4, v55, v4
\n
"
" s_mul_i32 s60, s7, 2
\n
"
" v_add_u32 v4, s60, v4
\n
"
" v_lshlrev_b32 v4, 2, v4
\n
"
";----------------------------------------------
\n
"
"; -- prefetch A0
\n
"
"s_add_u32 m0, 0, %[s_m0_init]
\n
"
...
...
@@ -570,198 +549,23 @@
" s_branch label_start
\n
"
" label_end :
\n
"
";----------------------------------------------
\n
"
" v_cvt_f32_i32 v128, v128
\n
"
" v_cvt_f32_i32 v129, v129
\n
"
" v_cvt_f32_i32 v130, v130
\n
"
" v_cvt_f32_i32 v131, v131
\n
"
" v_mul_f32 v128, v14, v128
\n
"
" v_mul_f32 v129, v14, v129
\n
"
" v_mul_f32 v130, v14, v130
\n
"
" v_mul_f32 v131, v14, v131
\n
"
" v_mul_f32 v128, v16, v128 row_newbcast:0
\n
"
" v_mul_f32 v129, v16, v129 row_newbcast:1
\n
"
" v_mul_f32 v130, v16, v130 row_newbcast:2
\n
"
" v_mul_f32 v131, v16, v131 row_newbcast:3
\n
"
" v_cvt_f32_i32 v132, v132
\n
"
" v_cvt_f32_i32 v133, v133
\n
"
" v_cvt_f32_i32 v134, v134
\n
"
" v_cvt_f32_i32 v135, v135
\n
"
" v_mul_f32 v132, v15, v132
\n
"
" v_mul_f32 v133, v15, v133
\n
"
" v_mul_f32 v134, v15, v134
\n
"
" v_mul_f32 v135, v15, v135
\n
"
" v_mul_f32 v132, v16, v132 row_newbcast:0
\n
"
" v_mul_f32 v133, v16, v133 row_newbcast:1
\n
"
" v_mul_f32 v134, v16, v134 row_newbcast:2
\n
"
" v_mul_f32 v135, v16, v135 row_newbcast:3
\n
"
" v_cvt_f32_i32 v136, v136
\n
"
" v_cvt_f32_i32 v137, v137
\n
"
" v_cvt_f32_i32 v138, v138
\n
"
" v_cvt_f32_i32 v139, v139
\n
"
" v_mul_f32 v136, v14, v136
\n
"
" v_mul_f32 v137, v14, v137
\n
"
" v_mul_f32 v138, v14, v138
\n
"
" v_mul_f32 v139, v14, v139
\n
"
" v_mul_f32 v136, v16, v136 row_newbcast:4
\n
"
" v_mul_f32 v137, v16, v137 row_newbcast:5
\n
"
" v_mul_f32 v138, v16, v138 row_newbcast:6
\n
"
" v_mul_f32 v139, v16, v139 row_newbcast:7
\n
"
" v_cvt_f32_i32 v140, v140
\n
"
" v_cvt_f32_i32 v141, v141
\n
"
" v_cvt_f32_i32 v142, v142
\n
"
" v_cvt_f32_i32 v143, v143
\n
"
" v_mul_f32 v140, v15, v140
\n
"
" v_mul_f32 v141, v15, v141
\n
"
" v_mul_f32 v142, v15, v142
\n
"
" v_mul_f32 v143, v15, v143
\n
"
" v_mul_f32 v140, v16, v140 row_newbcast:4
\n
"
" v_mul_f32 v141, v16, v141 row_newbcast:5
\n
"
" v_mul_f32 v142, v16, v142 row_newbcast:6
\n
"
" v_mul_f32 v143, v16, v143 row_newbcast:7
\n
"
" v_cvt_f32_i32 v144, v144
\n
"
" v_cvt_f32_i32 v145, v145
\n
"
" v_cvt_f32_i32 v146, v146
\n
"
" v_cvt_f32_i32 v147, v147
\n
"
" v_mul_f32 v144, v14, v144
\n
"
" v_mul_f32 v145, v14, v145
\n
"
" v_mul_f32 v146, v14, v146
\n
"
" v_mul_f32 v147, v14, v147
\n
"
" v_mul_f32 v144, v16, v144 row_newbcast:8
\n
"
" v_mul_f32 v145, v16, v145 row_newbcast:9
\n
"
" v_mul_f32 v146, v16, v146 row_newbcast:10
\n
"
" v_mul_f32 v147, v16, v147 row_newbcast:11
\n
"
" v_cvt_f32_i32 v148, v148
\n
"
" v_cvt_f32_i32 v149, v149
\n
"
" v_cvt_f32_i32 v150, v150
\n
"
" v_cvt_f32_i32 v151, v151
\n
"
" v_mul_f32 v148, v15, v148
\n
"
" v_mul_f32 v149, v15, v149
\n
"
" v_mul_f32 v150, v15, v150
\n
"
" v_mul_f32 v151, v15, v151
\n
"
" v_mul_f32 v148, v16, v148 row_newbcast:8
\n
"
" v_mul_f32 v149, v16, v149 row_newbcast:9
\n
"
" v_mul_f32 v150, v16, v150 row_newbcast:10
\n
"
" v_mul_f32 v151, v16, v151 row_newbcast:11
\n
"
" v_cvt_f32_i32 v152, v152
\n
"
" v_cvt_f32_i32 v153, v153
\n
"
" v_cvt_f32_i32 v154, v154
\n
"
" v_cvt_f32_i32 v155, v155
\n
"
" v_mul_f32 v152, v14, v152
\n
"
" v_mul_f32 v153, v14, v153
\n
"
" v_mul_f32 v154, v14, v154
\n
"
" v_mul_f32 v155, v14, v155
\n
"
" v_mul_f32 v152, v16, v152 row_newbcast:12
\n
"
" v_mul_f32 v153, v16, v153 row_newbcast:13
\n
"
" v_mul_f32 v154, v16, v154 row_newbcast:14
\n
"
" v_mul_f32 v155, v16, v155 row_newbcast:15
\n
"
" v_cvt_f32_i32 v156, v156
\n
"
" v_cvt_f32_i32 v157, v157
\n
"
" v_cvt_f32_i32 v158, v158
\n
"
" v_cvt_f32_i32 v159, v159
\n
"
" v_mul_f32 v156, v15, v156
\n
"
" v_mul_f32 v157, v15, v157
\n
"
" v_mul_f32 v158, v15, v158
\n
"
" v_mul_f32 v159, v15, v159
\n
"
" v_mul_f32 v156, v16, v156 row_newbcast:12
\n
"
" v_mul_f32 v157, v16, v157 row_newbcast:13
\n
"
" v_mul_f32 v158, v16, v158 row_newbcast:14
\n
"
" v_mul_f32 v159, v16, v159 row_newbcast:15
\n
"
" v_cvt_f32_i32 v160, v160
\n
"
" v_cvt_f32_i32 v161, v161
\n
"
" v_cvt_f32_i32 v162, v162
\n
"
" v_cvt_f32_i32 v163, v163
\n
"
" v_mul_f32 v160, v14, v160
\n
"
" v_mul_f32 v161, v14, v161
\n
"
" v_mul_f32 v162, v14, v162
\n
"
" v_mul_f32 v163, v14, v163
\n
"
" v_mul_f32 v160, v17, v160 row_newbcast:0
\n
"
" v_mul_f32 v161, v17, v161 row_newbcast:1
\n
"
" v_mul_f32 v162, v17, v162 row_newbcast:2
\n
"
" v_mul_f32 v163, v17, v163 row_newbcast:3
\n
"
" v_cvt_f32_i32 v164, v164
\n
"
" v_cvt_f32_i32 v165, v165
\n
"
" v_cvt_f32_i32 v166, v166
\n
"
" v_cvt_f32_i32 v167, v167
\n
"
" v_mul_f32 v164, v15, v164
\n
"
" v_mul_f32 v165, v15, v165
\n
"
" v_mul_f32 v166, v15, v166
\n
"
" v_mul_f32 v167, v15, v167
\n
"
" v_mul_f32 v164, v17, v164 row_newbcast:0
\n
"
" v_mul_f32 v165, v17, v165 row_newbcast:1
\n
"
" v_mul_f32 v166, v17, v166 row_newbcast:2
\n
"
" v_mul_f32 v167, v17, v167 row_newbcast:3
\n
"
" v_cvt_f32_i32 v168, v168
\n
"
" v_cvt_f32_i32 v169, v169
\n
"
" v_cvt_f32_i32 v170, v170
\n
"
" v_cvt_f32_i32 v171, v171
\n
"
" v_mul_f32 v168, v14, v168
\n
"
" v_mul_f32 v169, v14, v169
\n
"
" v_mul_f32 v170, v14, v170
\n
"
" v_mul_f32 v171, v14, v171
\n
"
" v_mul_f32 v168, v17, v168 row_newbcast:4
\n
"
" v_mul_f32 v169, v17, v169 row_newbcast:5
\n
"
" v_mul_f32 v170, v17, v170 row_newbcast:6
\n
"
" v_mul_f32 v171, v17, v171 row_newbcast:7
\n
"
" v_cvt_f32_i32 v172, v172
\n
"
" v_cvt_f32_i32 v173, v173
\n
"
" v_cvt_f32_i32 v174, v174
\n
"
" v_cvt_f32_i32 v175, v175
\n
"
" v_mul_f32 v172, v15, v172
\n
"
" v_mul_f32 v173, v15, v173
\n
"
" v_mul_f32 v174, v15, v174
\n
"
" v_mul_f32 v175, v15, v175
\n
"
" v_mul_f32 v172, v17, v172 row_newbcast:4
\n
"
" v_mul_f32 v173, v17, v173 row_newbcast:5
\n
"
" v_mul_f32 v174, v17, v174 row_newbcast:6
\n
"
" v_mul_f32 v175, v17, v175 row_newbcast:7
\n
"
" v_cvt_f32_i32 v176, v176
\n
"
" v_cvt_f32_i32 v177, v177
\n
"
" v_cvt_f32_i32 v178, v178
\n
"
" v_cvt_f32_i32 v179, v179
\n
"
" v_mul_f32 v176, v14, v176
\n
"
" v_mul_f32 v177, v14, v177
\n
"
" v_mul_f32 v178, v14, v178
\n
"
" v_mul_f32 v179, v14, v179
\n
"
" v_mul_f32 v176, v17, v176 row_newbcast:8
\n
"
" v_mul_f32 v177, v17, v177 row_newbcast:9
\n
"
" v_mul_f32 v178, v17, v178 row_newbcast:10
\n
"
" v_mul_f32 v179, v17, v179 row_newbcast:11
\n
"
" v_cvt_f32_i32 v180, v180
\n
"
" v_cvt_f32_i32 v181, v181
\n
"
" v_cvt_f32_i32 v182, v182
\n
"
" v_cvt_f32_i32 v183, v183
\n
"
" v_mul_f32 v180, v15, v180
\n
"
" v_mul_f32 v181, v15, v181
\n
"
" v_mul_f32 v182, v15, v182
\n
"
" v_mul_f32 v183, v15, v183
\n
"
" v_mul_f32 v180, v17, v180 row_newbcast:8
\n
"
" v_mul_f32 v181, v17, v181 row_newbcast:9
\n
"
" v_mul_f32 v182, v17, v182 row_newbcast:10
\n
"
" v_mul_f32 v183, v17, v183 row_newbcast:11
\n
"
" v_cvt_f32_i32 v184, v184
\n
"
" v_cvt_f32_i32 v185, v185
\n
"
" v_cvt_f32_i32 v186, v186
\n
"
" v_cvt_f32_i32 v187, v187
\n
"
" v_mul_f32 v184, v14, v184
\n
"
" v_mul_f32 v185, v14, v185
\n
"
" v_mul_f32 v186, v14, v186
\n
"
" v_mul_f32 v187, v14, v187
\n
"
" v_mul_f32 v184, v17, v184 row_newbcast:12
\n
"
" v_mul_f32 v185, v17, v185 row_newbcast:13
\n
"
" v_mul_f32 v186, v17, v186 row_newbcast:14
\n
"
" v_mul_f32 v187, v17, v187 row_newbcast:15
\n
"
" v_cvt_f32_i32 v188, v188
\n
"
" v_cvt_f32_i32 v189, v189
\n
"
" v_cvt_f32_i32 v190, v190
\n
"
" v_cvt_f32_i32 v191, v191
\n
"
" v_mul_f32 v188, v15, v188
\n
"
" v_mul_f32 v189, v15, v189
\n
"
" v_mul_f32 v190, v15, v190
\n
"
" v_mul_f32 v191, v15, v191
\n
"
" v_mul_f32 v188, v17, v188 row_newbcast:12
\n
"
" v_mul_f32 v189, v17, v189 row_newbcast:13
\n
"
" v_mul_f32 v190, v17, v190 row_newbcast:14
\n
"
" v_mul_f32 v191, v17, v191 row_newbcast:15
\n
"
_DEQUAN_CVT_
(
"%[c0]"
,
"%[c1]"
,
"%[c2]"
,
"%[c3]"
,
"%[a_scale0]"
,
" %[gq_scale0]"
,
"0"
,
"1"
,
"2"
,
"3"
)
_DEQUAN_CVT_
(
"%[c4]"
,
"%[c5]"
,
"%[c6]"
,
"%[c7]"
,
"%[a_scale1]"
,
" %[gq_scale0]"
,
"0"
,
"1"
,
"2"
,
"3"
)
_DEQUAN_CVT_
(
"%[c8]"
,
"%[c9]"
,
"%[c10]"
,
"%[c11]"
,
"%[a_scale0]"
,
" %[gq_scale0]"
,
"4"
,
"5"
,
"6"
,
"7"
)
_DEQUAN_CVT_
(
"%[c12]"
,
"%[c13]"
,
"%[c14]"
,
"%[c15]"
,
"%[a_scale1]"
,
" %[gq_scale0]"
,
"4"
,
"5"
,
"6"
,
"7"
)
_DEQUAN_CVT_
(
"%[c16]"
,
"%[c17]"
,
"%[c18]"
,
"%[c19]"
,
"%[a_scale0]"
,
" %[gq_scale0]"
,
"8"
,
"9"
,
"10"
,
"11"
)
_DEQUAN_CVT_
(
"%[c20]"
,
"%[c21]"
,
"%[c22]"
,
"%[c23]"
,
"%[a_scale1]"
,
" %[gq_scale0]"
,
"8"
,
"9"
,
"10"
,
"11"
)
_DEQUAN_CVT_
(
"%[c24]"
,
"%[c25]"
,
"%[c26]"
,
"%[c27]"
,
"%[a_scale0]"
,
" %[gq_scale0]"
,
"12"
,
"13"
,
"14"
,
"15"
)
_DEQUAN_CVT_
(
"%[c28]"
,
"%[c29]"
,
"%[c30]"
,
"%[c31]"
,
"%[a_scale1]"
,
" %[gq_scale0]"
,
"12"
,
"13"
,
"14"
,
"15"
)
_DEQUAN_CVT_
(
"%[c32]"
,
"%[c33]"
,
"%[c34]"
,
"%[c35]"
,
"%[a_scale0]"
,
" %[gq_scale1]"
,
"0"
,
"1"
,
"2"
,
"3"
)
_DEQUAN_CVT_
(
"%[c36]"
,
"%[c37]"
,
"%[c38]"
,
"%[c39]"
,
"%[a_scale1]"
,
" %[gq_scale1]"
,
"0"
,
"1"
,
"2"
,
"3"
)
_DEQUAN_CVT_
(
"%[c40]"
,
"%[c41]"
,
"%[c42]"
,
"%[c43]"
,
"%[a_scale0]"
,
" %[gq_scale1]"
,
"4"
,
"5"
,
"6"
,
"7"
)
_DEQUAN_CVT_
(
"%[c44]"
,
"%[c45]"
,
"%[c46]"
,
"%[c47]"
,
"%[a_scale1]"
,
" %[gq_scale1]"
,
"4"
,
"5"
,
"6"
,
"7"
)
_DEQUAN_CVT_
(
"%[c48]"
,
"%[c49]"
,
"%[c50]"
,
"%[c51]"
,
"%[a_scale0]"
,
" %[gq_scale1]"
,
"8"
,
"9"
,
"10"
,
"11"
)
_DEQUAN_CVT_
(
"%[c52]"
,
"%[c53]"
,
"%[c54]"
,
"%[c55]"
,
"%[a_scale1]"
,
" %[gq_scale1]"
,
"8"
,
"9"
,
"10"
,
"11"
)
_DEQUAN_CVT_
(
"%[c56]"
,
"%[c57]"
,
"%[c58]"
,
"%[c59]"
,
"%[a_scale0]"
,
" %[gq_scale1]"
,
"12"
,
"13"
,
"14"
,
"15"
)
_DEQUAN_CVT_
(
"%[c60]"
,
"%[c61]"
,
"%[c62]"
,
"%[c63]"
,
"%[a_scale1]"
,
" %[gq_scale1]"
,
"12"
,
"13"
,
"14"
,
"15"
)
#undef _UK_MFMA_
#undef _DEQUAN_CVT_
include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk_int8.hpp
View file @
6f7d1272
...
...
@@ -186,6 +186,50 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8
return
coords
;
}
// TODO: this row id is before shuffle atomic, need use acc distribution
//this calculation shared by G and SMQ
CK_TILE_DEVICE
auto
GetColCoords_GQSMQ
(
index_t
base_offset
)
{
constexpr
index_t
MLanes
=
BlockShape
::
Warp_M1
;
constexpr
index_t
Repeat_N
=
2
;
//different,this load is partitioned along N
// auto h_id = threadIdx.x / MLanes ;
// auto r_id = threadIdx.x & 0xffff;
// auto p_id = r_id/4;
// auto q_is = threadIdx.x & 0x3;
array
<
index_t
,
Repeat_N
>
coords
;
static_for
<
0
,
Repeat_N
,
1
>
{}([
&
](
auto
i
)
{
coords
.
at
(
i
)
=
base_coord
+
(
threadIdx
.
x
/
MLanes
)
*
4
+
(
threadIdx
.
x
&
0xffff
)
/
4
*
64
+
q_id
+
i
*
256
;
});
return
coords
;
}
//this calculation shared by G and SMQ
CK_TILE_DEVICE
auto
GetGQScale
(
const
COL_IDS
coords
,
const
GScaleDataType
*
g_scale_ptr
)
{
constexpr
index_t
n_size
=
coords
.
size
();
array
<
GScaleDataType
,
n_size
>
g_scale_value
;
static_for
<
0
,
n_size
,
1
>
{}([
&
](
auto
i
)
{
g_scale_value
.
at
(
i
)
=
g_scale_ptr
[
coords
[
i
]];
});
return
g_scale_value
;
}
CK_TILE_DEVICE
auto
GetSMQScale
(
const
COL_IDS
coords
,
const
YSmoothScaleDataType
*
y_scale_ptr
)
{
constexpr
index_t
n_size
=
coords
.
size
();
array
<
YSmoothScaleDataType
,
n_size
>
y_scale_value
;
static_for
<
0
,
n_size
,
1
>
{}([
&
](
auto
i
)
{
y_scale_value
.
at
(
i
)
=
y_scale_ptr
[
coords
[
i
]];
});
return
y_scale_value
;
}
template
<
typename
Karg
>
CK_TILE_DEVICE
auto
operator
()(
const
Karg
&
kargs
,
...
...
@@ -230,12 +274,6 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8
return
(
row_ids_a
[
i
])
&
0xffffff
;
},
number
<
row_ids_a
.
size
()
>
{});
// auto token_id_mma = generate_tuple(
// [&](auto i) {
// return (row_ids_a_mma[i]) &0xffffff;
// },
// number<row_ids_a_mma.size()>{});
//addr in fact
auto
a_coords
=
generate_tuple
(
[
&
](
auto
i
)
{
return
((
row_ids_a
[
i
])
&
0xffffff
)
*
kargs
.
stride_token
+
...
...
@@ -306,7 +344,7 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8
auto
smq_win
=
[
&
]()
{
const
YSmoothScaleDataType
*
smq_ptr
=
reinterpret_cast
<
const
YSmoothScaleDataType
*>
(
kargs
.
y_smooth_scale_ptr
)
+
static_cast
<
long_index_t
>
(
expert_id
)
*
smq_scale_expert_stride_0
+
intermediate_tile_id
*
BlockShape
::
Block_
N0
;
intermediate_tile_id
*
BlockShape
::
Block_
K1
;
// const GDataType* g_ptr = reinterpret_cast<const GScaleDataType*>(kargs.g_scale_ptr);//remember to add expert id for inline
auto
smq_view_
=
make_naive_tensor_view
<
address_space_enum
::
global
>
(
smq_ptr
,
...
...
@@ -346,15 +384,15 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8
auto
d_res
=
d_win
.
get_bottom_tensor_view
().
get_buffer_view
().
cached_buf_res_
;
//////gq
auto
dq_win
=
[
&
]()
{
const
DScaleDataType
*
g
_ptr
=
reinterpret_cast
<
const
DScaleDataType
*>
(
kargs
.
d_scale_ptr
)
+
static_cast
<
long_index_t
>
(
expert_id
)
*
d_scale_expert_stride_1
;
const
DScaleDataType
*
dq
_ptr
=
reinterpret_cast
<
const
DScaleDataType
*>
(
kargs
.
d_scale_ptr
)
+
static_cast
<
long_index_t
>
(
expert_id
)
*
d_scale_expert_stride_1
;
// const GDataType* g_ptr = reinterpret_cast<const GScaleDataType*>(kargs.d_scale_ptr)//remember to add expert_id as expert_idx
auto
g
_view_
=
make_naive_tensor_view_packed
<
address_space_enum
::
global
>
(
g
_ptr
,
auto
dq
_view_
=
make_naive_tensor_view_packed
<
address_space_enum
::
global
>
(
dq
_ptr
,
make_tuple
(
kargs
.
hidden_size
),
number
<
1
>
{});
return
g
_view_
;
return
dq
_view_
;
}();
auto
dq_res
=
dq_win
.
get_buffer_view
().
cached_buf_res_
;
...
...
@@ -400,15 +438,6 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8
generate_tuple
([
&
](
auto
i
)
{
return
cmp_lt_to_exec
(
token_id
[
i
],
kargs
.
num_tokens
);
},
number
<
row_ids_a
.
size
()
>
{});
// auto bridge_sst_win = [&]() {
// constexpr auto desc_ = Policy::template MakeBridgeLdsStoreForUKDesc<Problem>();
// constexpr auto dist_ = Policy::template GetUK_0<Problem>().MakeCBlockDist();
// return make_tile_window_linear(make_tensor_view<address_space_enum::lds>(
// reinterpret_cast<YDataType*>(smem), desc_),
// desc_.get_lengths(),
// {0, 0},
// dist_);
// }();
auto
o_res
=
make_wave_buffer_resource
(
reinterpret_cast
<
const
ODataType
*>
(
kargs
.
o_ptr
),
kargs
.
num_tokens
*
kargs
.
stride_token
*
sizeof
(
ODataType
));
...
...
@@ -417,16 +446,17 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8
auto
w_scale
=
GetWeightScale
(
row_coords_o
,
reinterpret_cast
<
const
TopkWeightDataType
*>
(
kargs
.
sorted_weight_ptr
));
auto
a_scale
=
GetAScale
(
row_coords_o
,
reinterpret_cast
<
const
TopkWeightDataType
*>
(
kargs
.
a_scale_ptr
));
row_ids_a_mma
,
reinterpret_cast
<
const
AScaleDataType
*>
(
kargs
.
a_scale_ptr
));
auto
gqsmq_coords
=
GetColCoords_GQSMQ
(
intermediated_tile_id
*
BlockShape
::
Block_K1
);
auto
dq_coords
=
gqsmq_coords
[
0
];
//only one for this tiling
auto
gq_scale
=
GetGQScale
(
gqsmq_coords
,
reinterpret_cast
<
const
GScaleDataType
*>
(
kargs
.
g_scale_ptr
+
static_cast
<
long_index_t
>
(
expert_id
)
*
shared_intermediate_size_0
));
auto
smq_scale
=
GetSMQScale
(
gqsmq_coords
,
reinterpret_cast
<
const
YSmoothScaleDataType
*>
(
kargs
.
y_smooth_scale_ptr
+
static_cast
<
long_index_t
>
(
expert_id
)
*
shared_intermediate_size_0
));
auto
uk_0
=
Policy
::
template
GetUK_0
<
Problem
>();
// auto acc_0= uk_0(
uk_0
(
row_ids_a_mma
,
//fake token id, 2D index for X scale
a_scale
,
dq_res
,
gq_res
,
smq_res
,
uk_0
(
a_scale
,
gq_scale
,
a_res
,
a_coords
,
g_res
,
...
...
@@ -457,6 +487,7 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8
auto
uk_1
=
Policy
::
template
GetUK_1
<
Problem
>();
uk_1
(
dq_res
,
d_res
,
dq_coords
,
d_coords
,
o_res
,
o_coords
,
...
...
@@ -464,6 +495,7 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8
smem
,
kargs
.
hidden_size
,
// total n number
w_scale
,
smq_scale
,
BlockShape
::
Block_N1
,
shared_intermediate_size_1
*
BlockShape
::
Block_N1
-
kr_1
*
BlockShape
::
Block_W1
,
// along N
kr_1
*
BlockShape
::
Block_W1
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment