Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
a759277d
Commit
a759277d
authored
Jan 02, 2025
by
shengnxu
Browse files
fix some error
parent
f549173b
Changes
12
Hide whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
2431 additions
and
614 deletions
+2431
-614
example/ck_tile/15_fused_moe/main.cpp
example/ck_tile/15_fused_moe/main.cpp
+14
-8
include/ck_tile/host/reference/reference_fused_moe.hpp
include/ck_tile/host/reference/reference_fused_moe.hpp
+34
-11
include/ck_tile/ops/flatmm.hpp
include/ck_tile/ops/flatmm.hpp
+2
-0
include/ck_tile/ops/flatmm/block/flatmm_32x512x256_1x4x1_16x16x64_int8.hpp
...ps/flatmm/block/flatmm_32x512x256_1x4x1_16x16x64_int8.hpp
+119
-270
include/ck_tile/ops/flatmm/block/flatmm_sn_32x256x512_1x4x1_16x16x64_int8.hpp
...flatmm/block/flatmm_sn_32x256x512_1x4x1_16x16x64_int8.hpp
+185
-260
include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x256x512_1x4x1_16x16x32_int8_1.inc
...uk/flatmm_sn_uk_gfx9_32x256x512_1x4x1_16x16x32_int8_1.inc
+947
-0
include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x256x512_1x4x1_16x16x32_int8_2.inc
...uk/flatmm_sn_uk_gfx9_32x256x512_1x4x1_16x16x32_int8_2.inc
+1070
-0
include/ck_tile/ops/flatmm/block/uk/flatmm_uk_gfx9_32x512x256_1x1x1_16x16x32_int8.inc
...lock/uk/flatmm_uk_gfx9_32x512x256_1x1x1_16x16x32_int8.inc
+53
-60
include/ck_tile/ops/fused_moe.hpp
include/ck_tile/ops/fused_moe.hpp
+1
-0
include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk_int8.hpp
...ed_moe/pipeline/fused_moegemm_pipeline_flatmm_uk_int8.hpp
+4
-3
include/ck_tile/ops/gemm/warp/warp_gemm.hpp
include/ck_tile/ops/gemm/warp/warp_gemm.hpp
+1
-1
include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp
...e/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp
+1
-1
No files found.
example/ck_tile/15_fused_moe/main.cpp
View file @
a759277d
...
@@ -97,14 +97,14 @@ auto create_args(int argc, char* argv[])
...
@@ -97,14 +97,14 @@ auto create_args(int argc, char* argv[])
.
insert
(
"tp"
,
"8"
,
"tensor parallel size"
)
.
insert
(
"tp"
,
"8"
,
"tensor parallel size"
)
.
insert
(
"v"
,
"1"
,
"cpu validation or not"
)
.
insert
(
"v"
,
"1"
,
"cpu validation or not"
)
.
insert
(
"kname"
,
"1"
,
"print kernel name or not"
)
.
insert
(
"kname"
,
"1"
,
"print kernel name or not"
)
.
insert
(
"prec_i"
,
"
bf16
"
,
"input precision"
)
.
insert
(
"prec_i"
,
"
int8
"
,
"input precision"
)
.
insert
(
"prec_w"
,
"
bf16
"
,
"weight precision"
)
.
insert
(
"prec_w"
,
"
int8
"
,
"weight precision"
)
.
insert
(
"prec_o"
,
"bf16"
,
"output precision"
)
.
insert
(
"prec_o"
,
"bf16"
,
"output precision"
)
.
insert
(
"prec_st"
,
"auto"
,
"token scale data type. auto will set to fp32"
)
.
insert
(
"prec_st"
,
"auto"
,
"token scale data type. auto will set to fp32"
)
.
insert
(
"prec_sw"
,
"auto"
,
"weight scale data type. auto will set to fp32"
)
.
insert
(
"prec_sw"
,
"auto"
,
"weight scale data type. auto will set to fp32"
)
.
insert
(
"prec_sq"
,
"auto"
,
"(dynamic) smooth quant data type. auto will set to fp32"
)
.
insert
(
"prec_sq"
,
"auto"
,
"(dynamic) smooth quant data type. auto will set to fp32"
)
.
insert
(
"prec_kw"
,
"auto"
,
"topk-weight data type. auto will set to fp32"
)
.
insert
(
"prec_kw"
,
"auto"
,
"topk-weight data type. auto will set to fp32"
)
.
insert
(
"fquant"
,
"
0
"
,
"fused-quant, 0:no, 1:smooth-dynamic-quant, 2:dynamic-quant"
)
.
insert
(
"fquant"
,
"
1
"
,
"fused-quant, 0:no, 1:smooth-dynamic-quant, 2:dynamic-quant"
)
.
insert
(
.
insert
(
"gate_only"
,
"1"
,
"w0(gate/up) style, 0:gate+up will double interm size, 1:only gate"
)
"gate_only"
,
"1"
,
"w0(gate/up) style, 0:gate+up will double interm size, 1:only gate"
)
.
insert
(
"api"
,
"0"
,
"benchmark api set: 0:fused-moe(moe-gemm+moe-sorting), 1:moe-gemm"
)
.
insert
(
"api"
,
"0"
,
"benchmark api set: 0:fused-moe(moe-gemm+moe-sorting), 1:moe-gemm"
)
...
@@ -218,10 +218,15 @@ bool run(const ck_tile::ArgParser& arg_parser)
...
@@ -218,10 +218,15 @@ bool run(const ck_tile::ArgParser& arg_parser)
ck_tile
::
HostTensor
<
GDataType
>
g_host
({
experts
,
shared_intermediate_size_0
,
hidden_size
});
ck_tile
::
HostTensor
<
GDataType
>
g_host
({
experts
,
shared_intermediate_size_0
,
hidden_size
});
ck_tile
::
HostTensor
<
DDataType
>
d_host
({
experts
,
hidden_size
,
shared_intermediate_size_1
});
ck_tile
::
HostTensor
<
DDataType
>
d_host
({
experts
,
hidden_size
,
shared_intermediate_size_1
});
ck_tile
::
HostTensor
<
ODataType
>
o_host
({
tokens
,
hidden_size
},
{
stride
,
1
});
ck_tile
::
HostTensor
<
ODataType
>
o_host
({
tokens
,
hidden_size
},
{
stride
,
1
});
ck_tile
::
HostTensor
<
AScaleDataType
>
sa_host
({
tokens
});
if
(
fused_quant
==
1
)
ck_tile
::
HostTensor
<
GScaleDataType
>
sg_host
({
shared_intermediate_size_0
});
{
ck_tile
::
HostTensor
<
DScaleDataType
>
sd_host
({
shared_intermediate_size_1
});
ck_tile
::
HostTensor
<
AScaleDataType
>
sa_host
({
tokens
,
topk
});
ck_tile
::
HostTensor
<
YSmoothScaleDataType
>
sy_host
({
shared_intermediate_size_1
});
// smooth-quant
}
else
{
ck_tile
::
HostTensor
<
AScaleDataType
>
sa_host
({
tokens
});
}
ck_tile
::
HostTensor
<
GScaleDataType
>
sg_host
({
experts
,
shared_intermediate_size_0
});
ck_tile
::
HostTensor
<
DScaleDataType
>
sd_host
({
experts
,
shared_intermediate_size_1
});
ck_tile
::
HostTensor
<
YSmoothScaleDataType
>
sy_host
({
experts
,
shared_intermediate_size_1
});
// smooth-quant
ck_tile
::
HostTensor
<
IndexDataType
>
topk_ids_host
({
tokens
,
topk
});
// to be sort
ck_tile
::
HostTensor
<
IndexDataType
>
topk_ids_host
({
tokens
,
topk
});
// to be sort
ck_tile
::
HostTensor
<
TopkWeightDataType
>
topk_weight_host
({
tokens
,
topk
});
// to be sort
ck_tile
::
HostTensor
<
TopkWeightDataType
>
topk_weight_host
({
tokens
,
topk
});
// to be sort
...
@@ -440,7 +445,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
...
@@ -440,7 +445,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
hidden_size
,
hidden_size
,
shared_intermediate_size_0
,
shared_intermediate_size_0
,
topk
,
topk
,
gate_only
);
gate_only
,
fused_quant
);
auto
o_dev
=
o_buf
.
ToHost
<
ODataType
>
();
auto
o_dev
=
o_buf
.
ToHost
<
ODataType
>
();
// o_dev.savetxt("gpu-out.txt", "float");
// o_dev.savetxt("gpu-out.txt", "float");
...
...
include/ck_tile/host/reference/reference_fused_moe.hpp
View file @
a759277d
...
@@ -75,7 +75,8 @@ void reference_fused_moe(
...
@@ -75,7 +75,8 @@ void reference_fused_moe(
ck_tile
::
index_t
hidden_size
,
ck_tile
::
index_t
hidden_size
,
ck_tile
::
index_t
intermediate_size
,
// this size is for gate/up
ck_tile
::
index_t
intermediate_size
,
// this size is for gate/up
ck_tile
::
index_t
topk
,
ck_tile
::
index_t
topk
,
ck_tile
::
index_t
gate_only
)
ck_tile
::
index_t
gate_only
,
ck_tile
::
index_t
fquant
)
{
{
assert
(
sorted_token_ids_host
.
get_num_of_dimension
()
==
1
);
assert
(
sorted_token_ids_host
.
get_num_of_dimension
()
==
1
);
assert
(
sorted_weight_host
.
get_num_of_dimension
()
==
1
);
assert
(
sorted_weight_host
.
get_num_of_dimension
()
==
1
);
...
@@ -106,22 +107,40 @@ void reference_fused_moe(
...
@@ -106,22 +107,40 @@ void reference_fused_moe(
return
;
return
;
ck_tile
::
index_t
i_expert
=
sorted_expert_ids_host
.
mData
[
i_tile
];
ck_tile
::
index_t
i_expert
=
sorted_expert_ids_host
.
mData
[
i_tile
];
ck_tile
::
index_t
i_token
=
sorted_token_ids_host
.
mData
[
i_flatten
];
ck_tile
::
index_t
i_token
=
sorted_token_ids_host
.
mData
[
i_flatten
];
if
(
i_token
>=
tokens
)
ck_tile
::
index_t
i_weight_idx
;
if
(
fquant
==
1
)
{
i_weight_idx
=
i_token
>>
24
;
i_token
=
i_token
&
0xffffff
;
}
if
(
i_token
>=
tokens
)
return
;
return
;
ck_tile
::
index_t
i_topk
=
get_topk_id
(
i_token
,
i_expert
);
// TODO: ugly
ck_tile
::
index_t
i_topk
=
get_topk_id
(
i_token
,
i_expert
);
// TODO: ugly
auto
weight
=
sorted_weight_host
.
mData
[
i_flatten
];
auto
weight
=
sorted_weight_host
.
mData
[
i_flatten
];
//top k ratio?
ck_tile
::
HostTensor
<
AccDataType
>
acc_0
({
1
,
intermediate_size_0
});
ck_tile
::
HostTensor
<
float
>
acc_0
({
1
,
intermediate_size_0
});
// first gemm
// first gemm
for
(
ck_tile
::
index_t
i_n
=
0
;
i_n
<
intermediate_size_0
;
i_n
++
)
for
(
ck_tile
::
index_t
i_n
=
0
;
i_n
<
intermediate_size_0
;
i_n
++
)
{
{
AccDataType
acc
=
static_cast
<
AccDataType
>
(
0
);
AccDataType
acc
=
static_cast
<
AccDataType
>
(
0
);
for
(
ck_tile
::
index_t
i_k
=
0
;
i_k
<
hidden_size
;
i_k
++
)
for
(
ck_tile
::
index_t
i_k
=
0
;
i_k
<
hidden_size
;
i_k
++
)
{
{
acc
+=
type_convert
<
AccDataType
>
(
a_host
(
i_token
,
i_k
))
*
acc
+=
type_convert
<
float
>
(
a_host
(
i_token
,
i_k
))
*
type_convert
<
AccDataType
>
(
g_host
(
i_expert
,
i_n
,
i_k
));
type_convert
<
float
>
(
g_host
(
i_expert
,
i_n
,
i_k
));
}
if
(
fquant
==
1
)
{
//smooth
acc_0
(
0
,
i_n
)
=
acc
*
sa_host
(
i_token
,
i_weight_idx
)
*
sg_host
(
i_expert
,
i_n
);
}
else
if
(
fquant
==
2
)
{
//dynamic
acc_0
(
0
,
i_n
)
=
acc
*
sa_host
(
i_token
)
*
sg_host
(
i_expert
,
i_n
);
}
else
{
//no quant
acc_0
(
0
,
i_n
)
=
acc
;
}
}
acc_0
(
0
,
i_n
)
=
acc
;
// printf("ie:%2d, it:%3d, in:%d, %f\n", i_expert, i_token, i_n, acc);
// printf("ie:%2d, it:%3d, in:%d, %f\n", i_expert, i_token, i_n, acc);
}
}
...
@@ -158,10 +177,14 @@ void reference_fused_moe(
...
@@ -158,10 +177,14 @@ void reference_fused_moe(
{
{
AccDataType
acc
=
static_cast
<
AccDataType
>
(
0
);
AccDataType
acc
=
static_cast
<
AccDataType
>
(
0
);
for
(
ck_tile
::
index_t
i_k
=
0
;
i_k
<
intermediate_size_1
;
i_k
++
)
for
(
ck_tile
::
index_t
i_k
=
0
;
i_k
<
intermediate_size_1
;
i_k
++
)
{
{
if
(
fquant
==
1
)
acc
+=
y
(
0
,
i_k
)
*
type_convert
<
AccDataType
>
(
d_host
(
i_expert
,
i_n
,
i_k
));
{
acc
+=
y
(
0
,
i_k
)
*
sy_host
(
i_expert
,
i_k
)
*
type_convert
<
float
>
(
d_host
(
i_expert
,
i_n
,
i_k
));
}
else
{
acc
+=
y
(
0
,
i_k
)
*
type_convert
<
float
>
(
d_host
(
i_expert
,
i_n
,
i_k
));
}
}
}
acc_1
(
0
,
i_n
)
=
acc
*
weight
;
// multiple weight here
acc_1
(
0
,
i_n
)
=
acc
*
type_convert
<
float
>
(
weight
)
;
// multiple weight here
}
}
for
(
ck_tile
::
index_t
i_n
=
0
;
i_n
<
hidden_size
;
i_n
++
)
for
(
ck_tile
::
index_t
i_n
=
0
;
i_n
<
hidden_size
;
i_n
++
)
...
@@ -177,7 +200,7 @@ void reference_fused_moe(
...
@@ -177,7 +200,7 @@ void reference_fused_moe(
auto
r
=
[
&
](
auto
i_token
)
{
auto
r
=
[
&
](
auto
i_token
)
{
for
(
ck_tile
::
index_t
i_n
=
0
;
i_n
<
hidden_size
;
i_n
++
)
for
(
ck_tile
::
index_t
i_n
=
0
;
i_n
<
hidden_size
;
i_n
++
)
{
{
AccDataType
acc
=
type_convert
<
AccDataType
>
(
0
);
AccDataType
acc
=
type_convert
<
float
>
(
0
);
for
(
ck_tile
::
index_t
i_topk
=
0
;
i_topk
<
topk
;
i_topk
++
)
for
(
ck_tile
::
index_t
i_topk
=
0
;
i_topk
<
topk
;
i_topk
++
)
{
{
acc
+=
out_topk_tokens
(
i_token
,
i_topk
,
i_n
);
acc
+=
out_topk_tokens
(
i_token
,
i_topk
,
i_n
);
...
...
include/ck_tile/ops/flatmm.hpp
View file @
a759277d
...
@@ -4,7 +4,9 @@
...
@@ -4,7 +4,9 @@
#pragma once
#pragma once
#include "ck_tile/ops/flatmm/block/flatmm_32x512x128_1x4x1_16x16x32.hpp"
#include "ck_tile/ops/flatmm/block/flatmm_32x512x128_1x4x1_16x16x32.hpp"
#include "ck_tile/ops/flatmm/block/flatmm_32x512x256_1x4x1_16x16x64_int8.hpp"
#include "ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32.hpp"
#include "ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32.hpp"
#include "ck_tile/ops/flatmm/block/flatmm_sn_32x256x512_1x4x1_16x16x64_int8.hpp"
#include "ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32_itl.hpp"
#include "ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32_itl.hpp"
#include "ck_tile/ops/flatmm/block/flatmm_uk_config.hpp"
#include "ck_tile/ops/flatmm/block/flatmm_uk_config.hpp"
#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
...
...
include/ck_tile/ops/flatmm/block/flatmm_32x512x256_1x4x1_16x16x64_int8.hpp
View file @
a759277d
...
@@ -38,7 +38,7 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_Base // for int8/fp8
...
@@ -38,7 +38,7 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_Base // for int8/fp8
{
{
static
constexpr
index_t
Block_M
=
32
;
static
constexpr
index_t
Block_M
=
32
;
static
constexpr
index_t
Block_N
=
512
;
static
constexpr
index_t
Block_N
=
512
;
static
constexpr
index_t
Block_K
=
25
8
;
static
constexpr
index_t
Block_K
=
25
6
;
static
constexpr
index_t
WarpPerBlock_M
=
1
;
static
constexpr
index_t
WarpPerBlock_M
=
1
;
static
constexpr
index_t
WarpPerBlock_N
=
4
;
static
constexpr
index_t
WarpPerBlock_N
=
4
;
...
@@ -245,14 +245,14 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16
...
@@ -245,14 +245,14 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16
// TODO: need paired with tile_window_linear!
// TODO: need paired with tile_window_linear!
// TODO: need call init_raw() before call this function!
// TODO: need call init_raw() before call this function!
template
<
typename
DQRes
,
typename
GQRes
,
typename
SMQRes
,
typename
ARes
,
typename
ACoords
,
typename
BRes
,
typename
BCoords
>
template
<
typename
AQRes
,
typename
DQRes
,
typename
GQRes
,
typename
SMQRes
,
typename
ARes
,
typename
ACoords
,
typename
BRes
,
typename
BCoords
>
CK_TILE_DEVICE
auto
CK_TILE_DEVICE
auto
operator
()(
index_t
row_ids_a_
,
operator
()(
index_t
row_ids_a_
,
const
DQ
es
&
res_aq
const
AQR
es
&
res_aq
,
const
DQes
&
res_dq
,
const
DQ
R
es
&
res_dq
,
const
GQRes
&
res_gq
,
const
GQRes
&
res_gq
,
const
SMQRes
&
res_smq
,
const
SMQRes
&
res_smq
,
const
Res
&
res_a
,
const
A
Res
&
res_a
,
const
ACoords
&
cached_coords_a
,
const
ACoords
&
cached_coords_a
,
const
BRes
&
res_b
,
const
BRes
&
res_b
,
const
BCoords
&
cached_coords_b
,
const
BCoords
&
cached_coords_b
,
...
@@ -302,7 +302,8 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16
...
@@ -302,7 +302,8 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16
},
},
number
<
a_sld
.
get_num_of_access
()
>
{});
number
<
a_sld
.
get_num_of_access
()
>
{});
index_t
loop_cnt
=
k
/
Block_K
;
//index_t loop_cnt = k / Block_K;
index_t
loop_cnt
=
k
;
// this is the acc thread buffer
// this is the acc thread buffer
register
int
v_z0
asm
(
"v128"
)
=
0
;
register
int
v_z0
asm
(
"v128"
)
=
0
;
...
@@ -398,7 +399,11 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16
...
@@ -398,7 +399,11 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16
// [v_acc_15]"+v"(v_acc[15]),
// [v_acc_15]"+v"(v_acc[15]),
[
v_token_id
]
"+v"
(
row_ids_a_
),
[
v_token_id
]
"+v"
(
row_ids_a_
),
[
s_mem_
]
"+r"
(
smem
)
[
s_mem_
]
"+r"
(
smem
)
:
[
s_res_dq0
]
"s"
(
res_dq
[
0
]),
:
[
s_res_aq0
]
"s"
(
res_aq
[
0
]),
[
s_res_aq1
]
"s"
(
res_aq
[
1
]),
[
s_res_aq2
]
"s"
(
res_aq
[
2
]),
[
s_res_aq3
]
"s"
(
res_aq
[
3
]),
[
s_res_dq0
]
"s"
(
res_dq
[
0
]),
[
s_res_dq1
]
"s"
(
res_dq
[
1
]),
[
s_res_dq1
]
"s"
(
res_dq
[
1
]),
[
s_res_dq2
]
"s"
(
res_dq
[
2
]),
[
s_res_dq2
]
"s"
(
res_dq
[
2
]),
[
s_res_dq3
]
"s"
(
res_dq
[
3
]),
[
s_res_dq3
]
"s"
(
res_dq
[
3
]),
...
@@ -436,10 +441,11 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16
...
@@ -436,10 +441,11 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16
[
v_os_b6
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
6
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b6
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
6
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b7
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
7
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b7
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
7
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_sld
a
]
"v"
(
static_cast
<
index_t
>
(
a_sld
.
cached_coords_
[
number
<
0
>
{}].
get_offset
()
*
sizeof
(
ADataType
))),
[
v_os_sld
]
"v"
(
static_cast
<
index_t
>
(
a_sld
.
cached_coords_
[
number
<
0
>
{}].
get_offset
()
*
sizeof
(
ADataType
))),
[
s_m0_init
]
"s"
(
m0_init_value
),
[
s_m0_init
]
"s"
(
m0_init_value
),
[
s_size_per_issue
]
"s"
(
size_per_issue
),
[
s_size_per_issue
]
"s"
(
size_per_issue
),
[
smem_sz
]
"n"
(
smem_buf_size
),
//(smem_buf_size),
[
smem_sz
]
"n"
(
smem_buf_size
),
//(smem_buf_size),
[
s_wave_id
]
"s"
(
get_warp_id
()),
[
sld_os_0
]
"n"
(
sld_os
[
number
<
0
>
{}].
value
),
[
sld_os_0
]
"n"
(
sld_os
[
number
<
0
>
{}].
value
),
[
sld_os_1
]
"n"
(
sld_os
[
number
<
1
>
{}].
value
),
[
sld_os_1
]
"n"
(
sld_os
[
number
<
1
>
{}].
value
),
[
sld_os_2
]
"n"
(
sld_os
[
number
<
2
>
{}].
value
),
[
sld_os_2
]
"n"
(
sld_os
[
number
<
2
>
{}].
value
),
...
@@ -450,7 +456,8 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16
...
@@ -450,7 +456,8 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16
[
sld_os_7
]
"n"
(
sld_os
[
number
<
7
>
{}].
value
),
[
sld_os_7
]
"n"
(
sld_os
[
number
<
7
>
{}].
value
),
[
s_tile_os_a
]
"s"
(
tile_offset_a_bytes
),
[
s_tile_os_a
]
"s"
(
tile_offset_a_bytes
),
[
s_tile_os_b
]
"s"
(
tile_offset_b_bytes
)
[
s_tile_os_b
]
"s"
(
tile_offset_b_bytes
)
:
"memory"
,
"a0"
,
"a1"
,
"a2"
,
"a3"
,
"a4"
,
"a5"
,
"a6"
,
"a7"
,
"a8"
,
"a9"
,
:
"memory"
,
"a0"
,
"a1"
,
"a2"
,
"a3"
,
"a4"
,
"a5"
,
"a6"
,
"a7"
,
"a8"
,
"a9"
,
"a10"
,
"a11"
,
"a12"
,
"a13"
,
"a14"
,
"a15"
,
"a16"
,
"a17"
,
"a18"
,
"a19"
,
"a10"
,
"a11"
,
"a12"
,
"a13"
,
"a14"
,
"a15"
,
"a16"
,
"a17"
,
"a18"
,
"a19"
,
"a20"
,
"a21"
,
"a22"
,
"a23"
,
"a24"
,
"a25"
,
"a26"
,
"a27"
,
"a28"
,
"a29"
,
"a20"
,
"a21"
,
"a22"
,
"a23"
,
"a24"
,
"a25"
,
"a26"
,
"a27"
,
"a28"
,
"a29"
,
"a30"
,
"a31"
,
"a32"
,
"a33"
,
"a34"
,
"a35"
,
"a36"
,
"a37"
,
"a38"
,
"a39"
,
"a30"
,
"a31"
,
"a32"
,
"a33"
,
"a34"
,
"a35"
,
"a36"
,
"a37"
,
"a38"
,
"a39"
,
...
@@ -480,84 +487,113 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16
...
@@ -480,84 +487,113 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16
"a236"
,
"a237"
,
"a238"
,
"a239"
,
"a240"
,
"a241"
,
"a242"
,
"a243"
,
"a236"
,
"a237"
,
"a238"
,
"a239"
,
"a240"
,
"a241"
,
"a242"
,
"a243"
,
"a244"
,
"a245"
,
"a246"
,
"a247"
,
"a248"
,
"a249"
,
"a250"
,
"a251"
,
"a244"
,
"a245"
,
"a246"
,
"a247"
,
"a248"
,
"a249"
,
"a250"
,
"a251"
,
"a252"
,
"a253"
,
"a254"
,
"a255"
,
"a252"
,
"a253"
,
"a254"
,
"a255"
,
"s16"
,
"s17"
,
"s18"
,
"s19"
,
"s20"
,
"s21"
,
"s22"
,
"s23"
,
"s6"
,
"s7"
,
"s8"
,
"s9"
,
"s10"
,
"s11"
,
"s12"
,
"s13"
,
"s14"
,
"s15"
,
"s86"
,
// s86 as tmp
"s16"
,
"s17"
,
"s18"
,
"s19"
,
"s20"
,
"s21"
,
"s22"
,
"s23"
,
"s24"
,
"s25"
,
"v64"
,
"v65"
,
"v66"
,
"v67"
,
"v68"
,
"v69"
,
"s26"
,
"s27"
,
"s28"
,
"s29"
,
"s30"
,
"s31"
,
"s32"
,
"s33"
,
"s34"
,
"s35"
,
"v70"
,
"v71"
,
"v72"
,
"v73"
,
"v74"
,
"v75"
,
"v76"
,
"v77"
,
"v78"
,
"v79"
,
"s36"
,
"s37"
,
"s38"
,
"s39"
,
"s40"
,
"s41"
,
"s42"
,
"s43"
,
"s44"
,
"s45"
,
"v80"
,
"v81"
,
"v82"
,
"v83"
,
"v84"
,
"v85"
,
"v86"
,
"v87"
,
"v88"
,
"v89"
,
"s46"
,
"s47"
,
"s48"
,
"s49"
,
"s50"
,
"s51"
,
"s52"
,
"s53"
,
"s54"
,
"v90"
,
"v91"
,
"v92"
,
"v93"
,
"v94"
,
"v95"
,
"v96"
,
"v97"
,
"v98"
,
"v99"
,
"s55"
,
"s56"
,
"s57"
,
"s58"
,
"s59"
,
"s60"
,
"s61"
,
"s62"
,
"s63"
,
"v100"
,
"v101"
,
"v102"
,
"v103"
,
"v104"
,
"v105"
,
"v106"
,
"v107"
,
"s64"
,
"s65"
,
"s66"
,
"s67"
,
"s68"
,
"s69"
,
"s70"
,
"s71"
,
"s72"
,
"v108"
,
"v109"
,
"v110"
,
"v111"
,
"v112"
,
"v113"
,
"v114"
,
"v115"
,
"s73"
,
"s74"
,
"s75"
,
"s76"
,
"s77"
,
"s78"
,
"s79"
,
"s80"
,
// s86 as tmp
"v116"
,
"v117"
,
"v118"
,
"v119"
,
"v120"
,
"v121"
,
"v122"
,
"v123"
,
"v1"
,
"v2"
,
"v3"
,
"v4"
,
"v5"
,
"v6"
,
"v7"
,
"v8"
,
"v9"
,
"v10"
,
"v124"
,
"v125"
,
"v126"
,
"v127"
"v11"
,
"v12"
,
"v13"
,
"v14"
,
"v15"
,
"v16"
,
"v17"
,
"v18"
,
"v19"
,
"v20"
,
"v21"
,
"v22"
,
"v23"
,
"v24"
,
"v25"
,
"v26"
,
"v27"
,
"v28"
,
"v29"
,
"v30"
,
"v31"
,
"v32"
,
"v33"
,
"v34"
,
"v35"
,
"v36"
,
"v37"
,
"v38"
,
"v39"
,
"v40"
,
"v41"
,
"v42"
,
"v43"
,
"v44"
,
"v45"
,
"v46"
,
"v47"
,
"v48"
,
"v49"
,
"v50"
,
"v51"
,
"v52"
,
"v53"
,
"v54"
,
"v55"
,
"v56"
,
"v57"
,
"v58"
,
"v59"
,
"v60"
,
"v61"
,
"v62"
,
"v63"
,
"v64"
,
"v65"
,
"v66"
,
"v67"
,
"v68"
,
"v69"
,
"v70"
,
"v71"
,
"v72"
,
"v73"
,
"v74"
,
"v75"
,
"v76"
,
"v77"
,
"v78"
,
"v79"
,
"v80"
,
"v81"
,
"v82"
,
"v83"
,
"v84"
,
"v85"
,
"v86"
,
"v87"
,
"v88"
,
"v89"
,
"v90"
,
"v91"
,
"v92"
,
"v93"
,
"v94"
,
"v95"
,
"v96"
,
"v97"
,
"v98"
,
"v99"
,
"v100"
,
"v101"
,
"v102"
,
"v103"
,
"v104"
,
"v105"
,
"v106"
,
"v107"
,
"v108"
,
"v109"
,
"v110"
,
"v111"
,
"v112"
,
"v113"
,
"v114"
,
"v115"
,
"v116"
,
"v117"
,
"v118"
,
"v119"
,
"v120"
,
"v121"
,
"v122"
,
"v123"
,
"v124"
,
"v125"
,
"v126"
,
"v127"
,
"v128"
,
"v129"
,
"v130"
,
"v131"
,
"v132"
,
"v133"
,
"v134"
,
"v135"
,
"v136"
,
"v137"
,
"v138"
,
"v139"
,
"v140"
,
"v141"
,
"v142"
,
"v143"
,
"v144"
,
"v145"
,
"v146"
,
"v147"
,
"v148"
,
"v149"
,
"v150"
,
"v151"
,
"v152"
,
"v153"
,
"v154"
,
"v155"
,
"v156"
,
"v157"
,
"v158"
,
"v159"
,
"v160"
,
"v161"
,
"v162"
,
"v163"
,
"v164"
,
"v165"
,
"v166"
,
"v167"
,
"v168"
,
"v169"
,
"v170"
,
"v171"
,
"v172"
,
"v173"
,
"v174"
,
"v175"
,
"v176"
,
"v177"
,
"v178"
,
"v179"
,
"v180"
,
"v181"
,
"v182"
,
"v183"
,
"v184"
,
"v185"
,
"v186"
,
"v187"
,
"v188"
,
"v189"
,
"v190"
,
"v191"
,
"v192"
,
"v193"
,
"v194"
,
"v195"
,
"v196"
,
"v197"
,
"v198"
,
"v199"
,
"v200"
,
"v201"
,
"v202"
,
"v203"
,
"v204"
,
"v205"
,
"v206"
,
"v207"
,
"v208"
,
"v209"
,
"v210"
,
"v211"
,
"v212"
,
"v213"
,
"v214"
,
"v215"
,
"v216"
,
"v217"
,
"v218"
,
"v219"
,
"v220"
,
"v221"
,
"v222"
,
"v223"
,
"v224"
,
"v225"
,
"v226"
,
"v227"
,
"v228"
,
"v229"
,
"v230"
,
"v231"
,
"v232"
,
"v233"
,
"v234"
,
"v235"
,
"v236"
,
"v237"
,
"v238"
,
"v239"
,
"v240"
,
"v241"
,
"v242"
,
"v243"
,
"v244"
,
"v245"
,
"v246"
,
"v247"
,
"v248"
,
"v249"
,
"v250"
,
"v251"
,
"v252"
,
"v253"
,
"v254"
,
"v255"
);
);
// clang-format on
// clang-format on
#pragma clang diagnostic pop
#pragma clang diagnostic pop
int32x4_t
v_acc
[
16
]{
0
};
int32x4_t
v_acc
[
16
]{
0
};
v_acc
[
0
][
0
]
=
v_z128
;
v_acc
[
0
][
0
]
=
v_z0
;
v_acc
[
0
][
1
]
=
v_z1
29
;
v_acc
[
0
][
1
]
=
v_z1
;
v_acc
[
0
][
2
]
=
v_z130
;
v_acc
[
0
][
2
]
=
v_z2
;
v_acc
[
0
][
3
]
=
v_z131
;
v_acc
[
0
][
3
]
=
v_z3
;
v_acc
[
1
][
0
]
=
v_z132
;
v_acc
[
1
][
0
]
=
v_z4
;
v_acc
[
1
][
1
]
=
v_z133
;
v_acc
[
1
][
1
]
=
v_z5
;
v_acc
[
1
][
2
]
=
v_z134
;
v_acc
[
1
][
2
]
=
v_z6
;
v_acc
[
1
][
3
]
=
v_z135
;
v_acc
[
1
][
3
]
=
v_z7
;
v_acc
[
2
][
0
]
=
v_z136
;
v_acc
[
2
][
0
]
=
v_z8
;
v_acc
[
2
][
1
]
=
v_z137
;
v_acc
[
2
][
1
]
=
v_z9
;
v_acc
[
2
][
2
]
=
v_z1
38
;
v_acc
[
2
][
2
]
=
v_z1
0
;
v_acc
[
2
][
3
]
=
v_z1
39
;
v_acc
[
2
][
3
]
=
v_z1
1
;
v_acc
[
3
][
0
]
=
v_z1
40
;
v_acc
[
3
][
0
]
=
v_z1
2
;
v_acc
[
3
][
1
]
=
v_z1
41
;
v_acc
[
3
][
1
]
=
v_z1
3
;
v_acc
[
3
][
2
]
=
v_z14
2
;
v_acc
[
3
][
2
]
=
v_z14
;
v_acc
[
3
][
3
]
=
v_z1
43
;
v_acc
[
3
][
3
]
=
v_z1
5
;
v_acc
[
4
][
0
]
=
v_z1
44
;
v_acc
[
4
][
0
]
=
v_z1
6
;
v_acc
[
4
][
1
]
=
v_z1
45
;
v_acc
[
4
][
1
]
=
v_z1
7
;
v_acc
[
4
][
2
]
=
v_z1
46
;
v_acc
[
4
][
2
]
=
v_z1
8
;
v_acc
[
4
][
3
]
=
v_z1
47
;
v_acc
[
4
][
3
]
=
v_z1
9
;
v_acc
[
5
][
0
]
=
v_z148
;
v_acc
[
5
][
0
]
=
v_z20
;
v_acc
[
5
][
1
]
=
v_z149
;
v_acc
[
5
][
1
]
=
v_z21
;
v_acc
[
5
][
2
]
=
v_z150
;
v_acc
[
5
][
2
]
=
v_z22
;
v_acc
[
5
][
3
]
=
v_z151
;
v_acc
[
5
][
3
]
=
v_z23
;
v_acc
[
6
][
0
]
=
v_z152
;
v_acc
[
6
][
0
]
=
v_z24
;
v_acc
[
6
][
1
]
=
v_z153
;
v_acc
[
6
][
1
]
=
v_z25
;
v_acc
[
6
][
2
]
=
v_z154
;
v_acc
[
6
][
2
]
=
v_z26
;
v_acc
[
6
][
3
]
=
v_z155
;
v_acc
[
6
][
3
]
=
v_z27
;
v_acc
[
7
][
0
]
=
v_z156
;
v_acc
[
7
][
0
]
=
v_z28
;
v_acc
[
7
][
1
]
=
v_z157
;
v_acc
[
7
][
1
]
=
v_z29
;
v_acc
[
7
][
2
]
=
v_z158
;
v_acc
[
7
][
2
]
=
v_z30
;
v_acc
[
7
][
3
]
=
v_z159
;
v_acc
[
7
][
3
]
=
v_z31
;
v_acc
[
8
][
0
]
=
v_z160
;
v_acc
[
8
][
0
]
=
v_z32
;
v_acc
[
8
][
1
]
=
v_z161
;
v_acc
[
8
][
1
]
=
v_z33
;
v_acc
[
8
][
2
]
=
v_z162
;
v_acc
[
8
][
2
]
=
v_z34
;
v_acc
[
8
][
3
]
=
v_z163
;
v_acc
[
8
][
3
]
=
v_z35
;
v_acc
[
9
][
0
]
=
v_z164
;
v_acc
[
9
][
0
]
=
v_z36
;
v_acc
[
9
][
1
]
=
v_z165
;
v_acc
[
9
][
1
]
=
v_z37
;
v_acc
[
9
][
2
]
=
v_z166
;
v_acc
[
9
][
2
]
=
v_z38
;
v_acc
[
9
][
3
]
=
v_z167
;
v_acc
[
9
][
3
]
=
v_z39
;
v_acc
[
10
][
0
]
=
v_z168
;
v_acc
[
10
][
0
]
=
v_z40
;
v_acc
[
10
][
1
]
=
v_z169
;
v_acc
[
10
][
1
]
=
v_z41
;
v_acc
[
10
][
2
]
=
v_z170
;
v_acc
[
10
][
2
]
=
v_z42
;
v_acc
[
10
][
3
]
=
v_z171
;
v_acc
[
10
][
3
]
=
v_z43
;
v_acc
[
11
][
0
]
=
v_z172
;
v_acc
[
11
][
0
]
=
v_z44
;
v_acc
[
11
][
1
]
=
v_z173
;
v_acc
[
11
][
1
]
=
v_z45
;
v_acc
[
11
][
2
]
=
v_z174
;
v_acc
[
11
][
2
]
=
v_z46
;
v_acc
[
11
][
3
]
=
v_z175
;
v_acc
[
11
][
3
]
=
v_z47
;
v_acc
[
12
][
0
]
=
v_z176
;
v_acc
[
12
][
0
]
=
v_z48
;
v_acc
[
12
][
1
]
=
v_z177
;
v_acc
[
12
][
1
]
=
v_z49
;
v_acc
[
12
][
2
]
=
v_z178
;
v_acc
[
12
][
2
]
=
v_z50
;
v_acc
[
12
][
3
]
=
v_z179
;
v_acc
[
12
][
3
]
=
v_z51
;
v_acc
[
13
][
0
]
=
v_z180
;
v_acc
[
13
][
0
]
=
v_z52
;
v_acc
[
13
][
1
]
=
v_z181
;
v_acc
[
13
][
1
]
=
v_z53
;
v_acc
[
13
][
2
]
=
v_z182
;
v_acc
[
13
][
2
]
=
v_z54
;
v_acc
[
13
][
3
]
=
v_z183
;
v_acc
[
13
][
3
]
=
v_z55
;
v_acc
[
14
][
0
]
=
v_z184
;
v_acc
[
14
][
0
]
=
v_z56
;
v_acc
[
14
][
1
]
=
v_z185
;
v_acc
[
14
][
1
]
=
v_z57
;
v_acc
[
14
][
2
]
=
v_z186
;
v_acc
[
14
][
2
]
=
v_z58
;
v_acc
[
14
][
3
]
=
v_z187
;
v_acc
[
14
][
3
]
=
v_z59
;
v_acc
[
15
][
0
]
=
v_z188
;
v_acc
[
15
][
0
]
=
v_z60
;
v_acc
[
15
][
1
]
=
v_z189
;
v_acc
[
15
][
1
]
=
v_z61
;
v_acc
[
15
][
2
]
=
v_z190
;
v_acc
[
15
][
2
]
=
v_z62
;
v_acc
[
15
][
3
]
=
v_z191
;
v_acc
[
15
][
3
]
=
v_z63
;
// return local scratch
// return local scratch
auto
c
=
MakeCBlockTile
();
auto
c
=
MakeCBlockTile
();
...
@@ -572,191 +608,4 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16
...
@@ -572,191 +608,4 @@ struct Flatmm_32x512x256_1x4x1_16x16x64_int8 : public Flatmm_32x512x256_1x4x1_16
}
}
};
};
struct
Flatmm_32x512x128_1x4x1_16x16x32_FP16
:
public
Flatmm_32x512x128_1x4x1_16x16x32_Base
{
using
ADataType
=
fp16_t
;
using
BDataType
=
fp16_t
;
// TODO: need paired with tile_window_linear!
// TODO: need call init_raw() before call this function!
template
<
typename
ARes
,
typename
ACoords
,
typename
BRes
,
typename
BCoords
>
CK_TILE_DEVICE
auto
operator
()(
const
ARes
&
res_a
,
const
ACoords
&
cached_coords_a
,
const
BRes
&
res_b
,
const
BCoords
&
cached_coords_b
,
CK_TILE_LDS_ADDR
void
*
smem
,
index_t
k
,
index_t
tile_offset_a
,
// for each tile, the offset to move for each unroll
index_t
tile_offset_b
)
// for each tile, the offset to move for each unroll
{
static_assert
(
ACoords
::
size
()
==
Block_M
*
Block_K
/
BlockSize
/
2
/*2x per dword*/
);
// 8
static_assert
(
BCoords
::
size
()
==
Repeat_N
);
auto
a_sst
=
make_tile_window
(
make_tensor_view
<
address_space_enum
::
lds
>
(
reinterpret_cast
<
CK_TILE_LDS_ADDR
ADataType
*>
(
smem
),
MakeLdsStoreDesc_A
()),
MakeLdsStoreDesc_A
().
get_lengths
(),
{
0
,
0
,
0
});
auto
a_sld
=
[
&
]()
{
constexpr
auto
a_warp_enc_
=
GetGemm_AWarpEnc
();
constexpr
auto
a_outer_dstr_enc
=
tile_distribution_encoding
<
sequence
<
WarpPerBlock_N
>
,
tuple
<
sequence
<
Repeat_M
,
WarpPerBlock_M
>
,
sequence
<
Repeat_K
>>
,
tuple
<
sequence
<
1
,
0
>>
,
tuple
<
sequence
<
1
,
0
>>
,
sequence
<
1
,
2
>
,
sequence
<
0
,
0
>>
{};
constexpr
auto
a_block_dstr_encode
=
detail
::
make_embed_tile_distribution_encoding
(
a_outer_dstr_enc
,
a_warp_enc_
);
return
make_tile_window_linear
(
make_tensor_view
<
address_space_enum
::
lds
>
(
reinterpret_cast
<
CK_TILE_LDS_ADDR
ADataType
*>
(
smem
),
MakeLdsLoadDesc_A
()),
MakeLdsLoadDesc_A
().
get_lengths
(),
{
0
,
0
},
make_static_tile_distribution
(
a_block_dstr_encode
));
}();
const
index_t
tile_offset_a_bytes
=
tile_offset_a
*
sizeof
(
ADataType
);
const
index_t
tile_offset_b_bytes
=
tile_offset_b
*
sizeof
(
BDataType
);
const
auto
[
m0_init_value
,
size_per_issue
]
=
get_async_store_smem_info
(
a_sst
);
constexpr
auto
smem_buf_size
=
MakeLdsLoadDesc_A
().
get_element_space_size
()
*
sizeof
(
ADataType
);
static_assert
(
a_sld
.
get_num_of_access
()
==
8
);
constexpr
auto
sld_os
=
generate_tuple
(
[
&
](
auto
i_access
)
{
return
number
<
a_sld
.
get_bottom_linear_offset
(
i_access
)
*
sizeof
(
ADataType
)
>
{};
},
number
<
a_sld
.
get_num_of_access
()
>
{});
index_t
loop_cnt
=
k
/
Block_K
;
// this is the acc thread buffer
fp32x4_t
v_acc
[
16
]{
.0
f
};
// B nr->kr
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Winline-asm"
// clang-format off
asm
volatile
(
#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_FP16
#include "uk/flatmm_uk_gfx9_32x512x128_1x1x1_16x16x16.inc"
#undef CK_TILE_FLATMM_UK_MFMA
:
[
s_loop_cnt
]
"+s"
(
loop_cnt
),
[
v_acc_0
]
"+v"
(
v_acc
[
0
]),
[
v_acc_1
]
"+v"
(
v_acc
[
1
]),
[
v_acc_2
]
"+v"
(
v_acc
[
2
]),
[
v_acc_3
]
"+v"
(
v_acc
[
3
]),
[
v_acc_4
]
"+v"
(
v_acc
[
4
]),
[
v_acc_5
]
"+v"
(
v_acc
[
5
]),
[
v_acc_6
]
"+v"
(
v_acc
[
6
]),
[
v_acc_7
]
"+v"
(
v_acc
[
7
]),
[
v_acc_8
]
"+v"
(
v_acc
[
8
]),
[
v_acc_9
]
"+v"
(
v_acc
[
9
]),
[
v_acc_10
]
"+v"
(
v_acc
[
10
]),
[
v_acc_11
]
"+v"
(
v_acc
[
11
]),
[
v_acc_12
]
"+v"
(
v_acc
[
12
]),
[
v_acc_13
]
"+v"
(
v_acc
[
13
]),
[
v_acc_14
]
"+v"
(
v_acc
[
14
]),
[
v_acc_15
]
"+v"
(
v_acc
[
15
]),
[
s_mem_
]
"+r"
(
smem
)
:
[
s_res_a0
]
"s"
(
res_a
[
0
]),
[
s_res_a1
]
"s"
(
res_a
[
1
]),
[
s_res_a2
]
"s"
(
res_a
[
2
]),
[
s_res_a3
]
"s"
(
res_a
[
3
]),
[
s_res_b0
]
"s"
(
res_b
[
0
]),
[
s_res_b1
]
"s"
(
res_b
[
1
]),
[
s_res_b2
]
"s"
(
res_b
[
2
]),
[
s_res_b3
]
"s"
(
res_b
[
3
]),
[
v_os_a0
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_a
[
number
<
0
>
{}]
*
sizeof
(
ADataType
))),
[
v_os_a1
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_a
[
number
<
1
>
{}]
*
sizeof
(
ADataType
))),
[
v_os_a2
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_a
[
number
<
2
>
{}]
*
sizeof
(
ADataType
))),
[
v_os_a3
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_a
[
number
<
3
>
{}]
*
sizeof
(
ADataType
))),
[
v_os_a4
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_a
[
number
<
4
>
{}]
*
sizeof
(
ADataType
))),
[
v_os_a5
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_a
[
number
<
5
>
{}]
*
sizeof
(
ADataType
))),
[
v_os_a6
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_a
[
number
<
6
>
{}]
*
sizeof
(
ADataType
))),
[
v_os_a7
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_a
[
number
<
7
>
{}]
*
sizeof
(
ADataType
))),
[
v_os_b0
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
0
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b1
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
1
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b2
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
2
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b3
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
3
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b4
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
4
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b5
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
5
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b6
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
6
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b7
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
7
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_slda
]
"v"
(
static_cast
<
index_t
>
(
a_sld
.
cached_coords_
[
number
<
0
>
{}].
get_offset
()
*
sizeof
(
ADataType
))),
[
s_m0_init
]
"s"
(
m0_init_value
),
[
s_size_per_issue
]
"s"
(
size_per_issue
),
[
smem_sz
]
"n"
(
smem_buf_size
),
//(smem_buf_size),
[
sld_os_0
]
"n"
(
sld_os
[
number
<
0
>
{}].
value
),
[
sld_os_1
]
"n"
(
sld_os
[
number
<
1
>
{}].
value
),
[
sld_os_2
]
"n"
(
sld_os
[
number
<
2
>
{}].
value
),
[
sld_os_3
]
"n"
(
sld_os
[
number
<
3
>
{}].
value
),
[
sld_os_4
]
"n"
(
sld_os
[
number
<
4
>
{}].
value
),
[
sld_os_5
]
"n"
(
sld_os
[
number
<
5
>
{}].
value
),
[
sld_os_6
]
"n"
(
sld_os
[
number
<
6
>
{}].
value
),
[
sld_os_7
]
"n"
(
sld_os
[
number
<
7
>
{}].
value
),
[
s_tile_os_a
]
"s"
(
tile_offset_a_bytes
),
[
s_tile_os_b
]
"s"
(
tile_offset_b_bytes
)
:
"memory"
,
"a0"
,
"a1"
,
"a2"
,
"a3"
,
"a4"
,
"a5"
,
"a6"
,
"a7"
,
"a8"
,
"a9"
,
"a10"
,
"a11"
,
"a12"
,
"a13"
,
"a14"
,
"a15"
,
"a16"
,
"a17"
,
"a18"
,
"a19"
,
"a20"
,
"a21"
,
"a22"
,
"a23"
,
"a24"
,
"a25"
,
"a26"
,
"a27"
,
"a28"
,
"a29"
,
"a30"
,
"a31"
,
"a32"
,
"a33"
,
"a34"
,
"a35"
,
"a36"
,
"a37"
,
"a38"
,
"a39"
,
"a40"
,
"a41"
,
"a42"
,
"a43"
,
"a44"
,
"a45"
,
"a46"
,
"a47"
,
"a48"
,
"a49"
,
"a50"
,
"a51"
,
"a52"
,
"a53"
,
"a54"
,
"a55"
,
"a56"
,
"a57"
,
"a58"
,
"a59"
,
"a60"
,
"a61"
,
"a62"
,
"a63"
,
"a64"
,
"a65"
,
"a66"
,
"a67"
,
"a68"
,
"a69"
,
"a70"
,
"a71"
,
"a72"
,
"a73"
,
"a74"
,
"a75"
,
"a76"
,
"a77"
,
"a78"
,
"a79"
,
"a80"
,
"a81"
,
"a82"
,
"a83"
,
"a84"
,
"a85"
,
"a86"
,
"a87"
,
"a88"
,
"a89"
,
"a90"
,
"a91"
,
"a92"
,
"a93"
,
"a94"
,
"a95"
,
"a96"
,
"a97"
,
"a98"
,
"a99"
,
"a100"
,
"a101"
,
"a102"
,
"a103"
,
"a104"
,
"a105"
,
"a106"
,
"a107"
,
"a108"
,
"a109"
,
"a110"
,
"a111"
,
"a112"
,
"a113"
,
"a114"
,
"a115"
,
"a116"
,
"a117"
,
"a118"
,
"a119"
,
"a120"
,
"a121"
,
"a122"
,
"a123"
,
"a124"
,
"a125"
,
"a126"
,
"a127"
,
"a128"
,
"a129"
,
"a130"
,
"a131"
,
"a132"
,
"a133"
,
"a134"
,
"a135"
,
"a136"
,
"a137"
,
"a138"
,
"a139"
,
"a140"
,
"a141"
,
"a142"
,
"a143"
,
"a144"
,
"a145"
,
"a146"
,
"a147"
,
"a148"
,
"a149"
,
"a150"
,
"a151"
,
"a152"
,
"a153"
,
"a154"
,
"a155"
,
"a156"
,
"a157"
,
"a158"
,
"a159"
,
"a160"
,
"a161"
,
"a162"
,
"a163"
,
"a164"
,
"a165"
,
"a166"
,
"a167"
,
"a168"
,
"a169"
,
"a170"
,
"a171"
,
"a172"
,
"a173"
,
"a174"
,
"a175"
,
"a176"
,
"a177"
,
"a178"
,
"a179"
,
"a180"
,
"a181"
,
"a182"
,
"a183"
,
"a184"
,
"a185"
,
"a186"
,
"a187"
,
"a188"
,
"a189"
,
"a190"
,
"a191"
,
"a192"
,
"a193"
,
"a194"
,
"a195"
,
"a196"
,
"a197"
,
"a198"
,
"a199"
,
"a200"
,
"a201"
,
"a202"
,
"a203"
,
"a204"
,
"a205"
,
"a206"
,
"a207"
,
"a208"
,
"a209"
,
"a210"
,
"a211"
,
"a212"
,
"a213"
,
"a214"
,
"a215"
,
"a216"
,
"a217"
,
"a218"
,
"a219"
,
"a220"
,
"a221"
,
"a222"
,
"a223"
,
"a224"
,
"a225"
,
"a226"
,
"a227"
,
"a228"
,
"a229"
,
"a230"
,
"a231"
,
"a232"
,
"a233"
,
"a234"
,
"a235"
,
"a236"
,
"a237"
,
"a238"
,
"a239"
,
"a240"
,
"a241"
,
"a242"
,
"a243"
,
"a244"
,
"a245"
,
"a246"
,
"a247"
,
"a248"
,
"a249"
,
"a250"
,
"a251"
,
"a252"
,
"a253"
,
"a254"
,
"a255"
,
"s16"
,
"s17"
,
"s18"
,
"s19"
,
"s20"
,
"s21"
,
"s22"
,
"s23"
,
"s86"
,
// s86 as tmp
"v64"
,
"v65"
,
"v66"
,
"v67"
,
"v68"
,
"v69"
,
"v70"
,
"v71"
,
"v72"
,
"v73"
,
"v74"
,
"v75"
,
"v76"
,
"v77"
,
"v78"
,
"v79"
,
"v80"
,
"v81"
,
"v82"
,
"v83"
,
"v84"
,
"v85"
,
"v86"
,
"v87"
,
"v88"
,
"v89"
,
"v90"
,
"v91"
,
"v92"
,
"v93"
,
"v94"
,
"v95"
,
"v96"
,
"v97"
,
"v98"
,
"v99"
,
"v100"
,
"v101"
,
"v102"
,
"v103"
,
"v104"
,
"v105"
,
"v106"
,
"v107"
,
"v108"
,
"v109"
,
"v110"
,
"v111"
,
"v112"
,
"v113"
,
"v114"
,
"v115"
,
"v116"
,
"v117"
,
"v118"
,
"v119"
,
"v120"
,
"v121"
,
"v122"
,
"v123"
,
"v124"
,
"v125"
,
"v126"
,
"v127"
);
// clang-format on
#pragma clang diagnostic pop
// return local scratch
auto
c
=
MakeCBlockTile
();
for
(
auto
i
=
0
;
i
<
16
;
i
++
)
{
c
.
get_thread_buffer
()[
4
*
i
+
0
]
=
v_acc
[
i
].
x
;
c
.
get_thread_buffer
()[
4
*
i
+
1
]
=
v_acc
[
i
].
y
;
c
.
get_thread_buffer
()[
4
*
i
+
2
]
=
v_acc
[
i
].
z
;
c
.
get_thread_buffer
()[
4
*
i
+
3
]
=
v_acc
[
i
].
w
;
}
return
c
;
}
};
}
// namespace ck_tile
}
// namespace ck_tile
include/ck_tile/ops/flatmm/block/flatmm_sn_32x256x512_1x4x1_16x16x64_int8.hpp
View file @
a759277d
...
@@ -73,6 +73,7 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x
...
@@ -73,6 +73,7 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x
{
{
using
BDataType
=
int8_t
;
using
BDataType
=
int8_t
;
using
ODataType
=
int8_t
;
using
ODataType
=
int8_t
;
using
DScaleDataType
=
float_t
;
// TODO: need paired with tile_window_linear!
// TODO: need paired with tile_window_linear!
// TODO: need call init_raw() before call this function!
// TODO: need call init_raw() before call this function!
...
@@ -111,38 +112,38 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x
...
@@ -111,38 +112,38 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x
index_t
loop_cnt
=
n
/
Block_N
;
index_t
loop_cnt
=
n
/
Block_N
;
register
float
v_c0
asm
(
"v64"
);
//
register float v_c0 asm("v64");
register
float
v_c1
asm
(
"v65"
);
//
register float v_c1 asm("v65");
register
float
v_c2
asm
(
"v66"
);
//
register float v_c2 asm("v66");
register
float
v_c3
asm
(
"v67"
);
//
register float v_c3 asm("v67");
register
float
v_c4
asm
(
"v68"
);
//
register float v_c4 asm("v68");
register
float
v_c5
asm
(
"v69"
);
//
register float v_c5 asm("v69");
register
float
v_c6
asm
(
"v70"
);
//
register float v_c6 asm("v70");
register
float
v_c7
asm
(
"v71"
);
//
register float v_c7 asm("v71");
register
float
v_c8
asm
(
"v72"
);
//
register float v_c8 asm("v72");
register
float
v_c9
asm
(
"v73"
);
//
register float v_c9 asm("v73");
register
float
v_c10
asm
(
"v74"
);
//
register float v_c10 asm("v74");
register
float
v_c11
asm
(
"v75"
);
//
register float v_c11 asm("v75");
register
float
v_c12
asm
(
"v76"
);
//
register float v_c12 asm("v76");
register
float
v_c13
asm
(
"v77"
);
//
register float v_c13 asm("v77");
register
float
v_c14
asm
(
"v78"
);
//
register float v_c14 asm("v78");
register
float
v_c15
asm
(
"v79"
);
//
register float v_c15 asm("v79");
register
float
v_c16
asm
(
"v80"
);
//
register float v_c16 asm("v80");
register
float
v_c17
asm
(
"v81"
);
//
register float v_c17 asm("v81");
register
float
v_c18
asm
(
"v82"
);
//
register float v_c18 asm("v82");
register
float
v_c19
asm
(
"v83"
);
//
register float v_c19 asm("v83");
register
float
v_c20
asm
(
"v84"
);
//
register float v_c20 asm("v84");
register
float
v_c21
asm
(
"v85"
);
//
register float v_c21 asm("v85");
register
float
v_c22
asm
(
"v86"
);
//
register float v_c22 asm("v86");
register
float
v_c23
asm
(
"v87"
);
//
register float v_c23 asm("v87");
register
float
v_c24
asm
(
"v88"
);
//
register float v_c24 asm("v88");
register
float
v_c25
asm
(
"v89"
);
//
register float v_c25 asm("v89");
register
float
v_c26
asm
(
"v90"
);
//
register float v_c26 asm("v90");
register
float
v_c27
asm
(
"v91"
);
//
register float v_c27 asm("v91");
register
float
v_c28
asm
(
"v92"
);
//
register float v_c28 asm("v92");
register
float
v_c29
asm
(
"v93"
);
//
register float v_c29 asm("v93");
register
float
v_c30
asm
(
"v94"
);
//
register float v_c30 asm("v94");
register
float
v_c31
asm
(
"v95"
);
//
register float v_c31 asm("v95");
int32_t
nan_hi
=
0x7fff0000
;
int32_t
nan_hi
=
0x7fff0000
;
int32_t
nan_lo
=
0x00007fff
;
int32_t
nan_lo
=
0x00007fff
;
...
@@ -175,45 +176,44 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x
...
@@ -175,45 +176,44 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x
#pragma clang diagnostic push
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Winline-asm"
#pragma clang diagnostic ignored "-Winline-asm"
asm
volatile
(
asm
volatile
(
#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_
BF16
#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_
INT8
#include "uk/flatmm_sn_uk_gfx9_32x
128
x512_1x4x1_16x16x1
6
.inc"
#include "uk/flatmm_sn_uk_gfx9_32x
256
x512_1x4x1_16x16x
32_int8_
1.inc"
#undef CK_TILE_FLATMM_UK_MFMA
#undef CK_TILE_FLATMM_UK_MFMA
:
[
smem_
]
"+r"
(
smem
),
:
[
smem_
]
"+r"
(
smem
),
[
s_loop_cnt
]
"+s"
(
loop_cnt
),
[
s_loop_cnt
]
"+s"
(
loop_cnt
)
[
c0
]
"+v"
(
v_c0
),
// [c0]"+v" (v_c0),
[
c1
]
"+v"
(
v_c1
),
// [c1]"+v" (v_c1),
[
c2
]
"+v"
(
v_c2
),
// [c2]"+v" (v_c2),
[
c3
]
"+v"
(
v_c3
),
// [c3]"+v" (v_c3),
[
c4
]
"+v"
(
v_c4
),
// [c4]"+v" (v_c4),
[
c5
]
"+v"
(
v_c5
),
// [c5]"+v" (v_c5),
[
c6
]
"+v"
(
v_c6
),
// [c6]"+v" (v_c6),
[
c7
]
"+v"
(
v_c7
),
// [c7]"+v" (v_c7),
[
c8
]
"+v"
(
v_c8
),
// [c8]"+v" (v_c8),
[
c9
]
"+v"
(
v_c9
),
// [c9]"+v" (v_c9),
[
c10
]
"+v"
(
v_c10
),
// [c10]"+v"(v_c10),
[
c11
]
"+v"
(
v_c11
),
// [c11]"+v"(v_c11),
[
c12
]
"+v"
(
v_c12
),
// [c12]"+v"(v_c12),
[
c13
]
"+v"
(
v_c13
),
// [c13]"+v"(v_c13),
[
c14
]
"+v"
(
v_c14
),
// [c14]"+v"(v_c14),
[
c15
]
"+v"
(
v_c15
),
// [c15]"+v"(v_c15),
[
c16
]
"+v"
(
v_c16
),
// [c16]"+v"(v_c16),
[
c17
]
"+v"
(
v_c17
),
// [c17]"+v"(v_c17),
[
c18
]
"+v"
(
v_c18
),
// [c18]"+v"(v_c18),
[
c19
]
"+v"
(
v_c19
),
// [c19]"+v"(v_c19),
[
c20
]
"+v"
(
v_c20
),
// [c20]"+v"(v_c20),
[
c21
]
"+v"
(
v_c21
),
// [c21]"+v"(v_c21),
[
c22
]
"+v"
(
v_c22
),
// [c22]"+v"(v_c22),
[
c23
]
"+v"
(
v_c23
),
// [c23]"+v"(v_c23),
[
c24
]
"+v"
(
v_c24
),
// [c24]"+v"(v_c24),
[
c25
]
"+v"
(
v_c25
),
// [c25]"+v"(v_c25),
[
c26
]
"+v"
(
v_c26
),
// [c26]"+v"(v_c26),
[
c27
]
"+v"
(
v_c27
),
// [c27]"+v"(v_c27),
[
c28
]
"+v"
(
v_c28
),
// [c28]"+v"(v_c28),
[
c29
]
"+v"
(
v_c29
),
// [c29]"+v"(v_c29),
[
c30
]
"+v"
(
v_c30
),
// [c30]"+v"(v_c30),
[
c31
]
"+v"
(
v_c31
)
// [c31]"+v"(v_c31)
:
:
[
sld_a_base
]
"n"
(
0
),
[
sld_a_base
]
"n"
(
0
),
[
shfl_base
]
"n"
(
0
),
[
shfl_base
]
"n"
(
0
),
[
v_sld_y_os
]
"v"
(
sld_y_os
),
[
v_sld_y_os
]
"v"
(
sld_y_os
),
[
v_sfl_sld
]
"v"
(
sfl_sld
),
[
v_sfl_sld
]
"v"
(
sfl_sld
),
...
@@ -290,177 +290,85 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x
...
@@ -290,177 +290,85 @@ struct FlatmmSn_32x256x512_1x4x1_16x16x64_int8 : public FlatmmSn_32x256x512_1x4x
"a236"
,
"a237"
,
"a238"
,
"a239"
,
"a240"
,
"a241"
,
"a242"
,
"a243"
,
"a236"
,
"a237"
,
"a238"
,
"a239"
,
"a240"
,
"a241"
,
"a242"
,
"a243"
,
"a244"
,
"a245"
,
"a246"
,
"a247"
,
"a248"
,
"a249"
,
"a250"
,
"a251"
,
"a244"
,
"a245"
,
"a246"
,
"a247"
,
"a248"
,
"a249"
,
"a250"
,
"a251"
,
"a252"
,
"a253"
,
"a254"
,
"a255"
,
"a252"
,
"a253"
,
"a254"
,
"a255"
,
"s8"
,
"s9"
,
"s12"
,
"s13"
,
"s14"
,
"s15"
,
"s38"
,
"s39"
,
"s52"
,
"s86"
,
"s6"
,
"s7"
,
"s8"
,
"s9"
,
"s10"
,
"s11"
,
"s12"
,
"s13"
,
"s14"
,
"s15"
,
"s36"
,
"s37"
,
"s16"
,
"s17"
,
"s18"
,
"s19"
,
"s20"
,
"s21"
,
"s22"
,
"s23"
,
"s24"
,
"s25"
,
"v50"
,
"v54"
,
"v55"
,
"s26"
,
"s27"
,
"s28"
,
"s29"
,
"s30"
,
"s31"
,
"s32"
,
"s33"
,
"s34"
,
"s35"
,
"v64"
,
"v65"
,
"v66"
,
"v67"
,
"v68"
,
"v69"
,
"v70"
,
"v71"
,
"s36"
,
"s37"
,
"s38"
,
"s39"
,
"s40"
,
"s41"
,
"s42"
,
"s43"
,
"s44"
,
"s45"
,
"v72"
,
"v73"
,
"v74"
,
"v75"
,
"v76"
,
"v77"
,
"v78"
,
"v79"
,
"s46"
,
"s47"
,
"s48"
,
"s49"
,
"s50"
,
"s51"
,
"s52"
,
"s53"
,
"s54"
,
"v80"
,
"v81"
,
"v82"
,
"v83"
,
"v84"
,
"v85"
,
"v86"
,
"v87"
,
"s55"
,
"s56"
,
"s57"
,
"s58"
,
"s59"
,
"s60"
,
"s61"
,
"s62"
,
"s63"
,
"v88"
,
"v89"
,
"v90"
,
"v91"
,
"v92"
,
"v93"
,
"v94"
,
"v95"
,
"s64"
,
"s65"
,
"s66"
,
"s67"
,
"s68"
,
"s69"
,
"s70"
,
"s71"
,
"s72"
,
"v128"
,
"v129"
,
"v130"
,
"v131"
,
"s73"
,
"s74"
,
"s75"
,
"s76"
,
"s77"
,
"s78"
,
"s79"
,
"s80"
,
// s86 as tmp
"v132"
,
"v133"
,
"v134"
,
"v135"
,
"v136"
,
"v137"
,
"v138"
,
"v139"
,
"v1"
,
"v2"
,
"v3"
,
"v4"
,
"v5"
,
"v6"
,
"v7"
,
"v8"
,
"v9"
,
"v10"
,
"v140"
,
"v141"
,
"v142"
,
"v143"
,
"v144"
,
"v145"
,
"v146"
,
"v147"
,
"v11"
,
"v12"
,
"v13"
,
"v14"
,
"v15"
,
"v16"
,
"v17"
,
"v18"
,
"v19"
,
"v148"
,
"v149"
,
"v150"
,
"v151"
,
"v152"
,
"v153"
,
"v154"
,
"v155"
,
"v20"
,
"v21"
,
"v22"
,
"v23"
,
"v24"
,
"v25"
,
"v26"
,
"v27"
,
"v28"
,
"v156"
,
"v157"
,
"v158"
,
"v159"
,
"v160"
,
"v161"
,
"v162"
,
"v163"
,
"v29"
,
"v30"
,
"v31"
,
"v32"
,
"v33"
,
"v34"
,
"v35"
,
"v36"
,
"v37"
,
"v164"
,
"v165"
,
"v166"
,
"v167"
,
"v168"
,
"v169"
,
"v170"
,
"v171"
,
"v38"
,
"v39"
,
"v40"
,
"v41"
,
"v42"
,
"v43"
,
"v44"
,
"v45"
,
"v46"
,
"v172"
,
"v173"
,
"v174"
,
"v175"
,
"v176"
,
"v177"
,
"v178"
,
"v179"
,
"v47"
,
"v48"
,
"v49"
,
"v50"
,
"v51"
,
"v52"
,
"v53"
,
"v54"
,
"v55"
,
"v180"
,
"v181"
,
"v182"
,
"v183"
,
"v184"
,
"v185"
,
"v186"
,
"v187"
,
"v56"
,
"v57"
,
"v58"
,
"v59"
,
"v60"
,
"v61"
,
"v62"
,
"v63"
,
"v64"
,
"v188"
,
"v189"
,
"v190"
,
"v191"
,
"v192"
,
"v193"
,
"v194"
,
"v195"
,
"v65"
,
"v66"
,
"v67"
,
"v68"
,
"v69"
,
"v70"
,
"v71"
,
"v72"
,
"v73"
,
"v196"
,
"v197"
,
"v198"
,
"v199"
,
"v200"
,
"v201"
,
"v202"
,
"v203"
,
"v74"
,
"v75"
,
"v76"
,
"v77"
,
"v78"
,
"v79"
,
"v80"
,
"v81"
,
"v82"
,
"v204"
,
"v205"
,
"v206"
,
"v207"
,
"v208"
,
"v209"
,
"v210"
,
"v211"
,
"v83"
,
"v84"
,
"v85"
,
"v86"
,
"v87"
,
"v88"
,
"v89"
,
"v90"
,
"v91"
,
"v212"
,
"v213"
,
"v214"
,
"v215"
,
"v216"
,
"v217"
,
"v218"
,
"v219"
,
"v92"
,
"v93"
,
"v94"
,
"v95"
,
"v96"
,
"v97"
,
"v98"
,
"v99"
,
"v100"
,
"v220"
,
"v221"
,
"v222"
,
"v223"
,
"v224"
,
"v225"
,
"v226"
,
"v227"
,
"v101"
,
"v102"
,
"v103"
,
"v104"
,
"v105"
,
"v106"
,
"v107"
,
"v108"
,
"v228"
,
"v229"
,
"v230"
,
"v231"
,
"v232"
,
"v233"
,
"v234"
,
"v235"
,
"v109"
,
"v110"
,
"v111"
,
"v112"
,
"v113"
,
"v114"
,
"v115"
,
"v116"
,
"v236"
,
"v237"
,
"v238"
,
"v239"
,
"v240"
,
"v241"
,
"v242"
,
"v243"
,
"v117"
,
"v118"
,
"v119"
,
"v120"
,
"v121"
,
"v122"
,
"v123"
,
"v124"
,
"v244"
,
"v245"
,
"v246"
,
"v247"
,
"v248"
,
"v249"
,
"v250"
,
"v251"
,
"v125"
,
"v126"
,
"v127"
,
"v128"
,
"v129"
,
"v130"
,
"v131"
,
"v132"
,
"v252"
,
"v253"
,
"v254"
,
"v255"
"v133"
,
"v134"
,
"v135"
,
"v136"
,
"v137"
,
"v138"
,
"v139"
,
"v140"
,
"v141"
,
"v142"
,
"v143"
,
"v144"
,
"v145"
,
"v146"
,
"v147"
,
"v148"
,
"v149"
,
"v150"
,
"v151"
,
"v152"
,
"v153"
,
"v154"
,
"v155"
,
"v156"
,
"v157"
,
"v158"
,
"v159"
,
"v160"
,
"v161"
,
"v162"
,
"v163"
,
"v164"
,
"v165"
,
"v166"
,
"v167"
,
"v168"
,
"v169"
,
"v170"
,
"v171"
,
"v172"
,
"v173"
,
"v174"
,
"v175"
,
"v176"
,
"v177"
,
"v178"
,
"v179"
,
"v180"
,
"v181"
,
"v182"
,
"v183"
,
"v184"
,
"v185"
,
"v186"
,
"v187"
,
"v188"
,
"v189"
,
"v190"
,
"v191"
,
"v192"
,
"v193"
,
"v194"
,
"v195"
,
"v196"
,
"v197"
,
"v198"
,
"v199"
,
"v200"
,
"v201"
,
"v202"
,
"v203"
,
"v204"
,
"v205"
,
"v206"
,
"v207"
,
"v208"
,
"v209"
,
"v210"
,
"v211"
,
"v212"
,
"v213"
,
"v214"
,
"v215"
,
"v216"
,
"v217"
,
"v218"
,
"v219"
,
"v220"
,
"v221"
,
"v222"
,
"v223"
,
"v224"
,
"v225"
,
"v226"
,
"v227"
,
"v228"
,
"v229"
,
"v230"
,
"v231"
,
"v232"
,
"v233"
,
"v234"
,
"v235"
,
"v236"
,
"v237"
,
"v238"
,
"v239"
,
"v240"
,
"v241"
,
"v242"
,
"v243"
,
"v244"
,
"v245"
,
"v246"
,
"v247"
,
"v248"
,
"v249"
,
"v250"
,
"v251"
,
"v252"
,
"v253"
,
"v254"
,
"v255"
);
);
#pragma clang diagnostic pop
// clang-format on
}
};
struct
FlatmmSn_32x128x512_1x4x1_16x16x32_FP16
:
public
FlatmmSn_32x128x512_1x4x1_16x16x32_Base
{
using
BDataType
=
bf16_t
;
using
ODataType
=
bf16_t
;
// TODO: need paired with tile_window_linear!
// TODO: need call init_raw() before call this function!
// template <typename AWindow, typename BWindow, typename OWindow, typename ScaleTensor>
template
<
typename
BRes
,
typename
BCoords
,
typename
ORes
,
typename
OCoords
,
typename
OFlags
,
typename
ScaleTensor
>
CK_TILE_DEVICE
auto
operator
()(
const
BRes
&
res_b
,
const
BCoords
&
cached_coords_b
,
const
ORes
&
res_o
,
const
OCoords
&
cached_coords_o
,
const
OFlags
&
o_flags
,
// this should be in sgpr
CK_TILE_LDS_ADDR
void
*
smem
,
index_t
n
,
// loop along n dim
const
ScaleTensor
&
scale_
,
index_t
tile_offset_b
,
// stride b is fixed to blockKr * blockW, but still can adjust
index_t
tile_offset_o
)
{
static_assert
(
BCoords
::
size
()
==
8
);
// 8
static_assert
(
OCoords
::
size
()
==
8
);
const
index_t
tile_stride_b_bytes
=
tile_offset_b
*
sizeof
(
BDataType
);
const
index_t
tile_stride_o_bytes
=
tile_offset_o
*
sizeof
(
ODataType
);
static_assert
(
ScaleTensor
::
size
()
==
2
);
float
s0
=
scale_
[
number
<
0
>
{}];
float
s1
=
scale_
[
number
<
1
>
{}];
index_t
loop_cnt
=
n
/
Block_N
;
register
float
v_c0
asm
(
"v64"
);
register
float
v_c1
asm
(
"v65"
);
register
float
v_c2
asm
(
"v66"
);
register
float
v_c3
asm
(
"v67"
);
register
float
v_c4
asm
(
"v68"
);
register
float
v_c5
asm
(
"v69"
);
register
float
v_c6
asm
(
"v70"
);
register
float
v_c7
asm
(
"v71"
);
register
float
v_c8
asm
(
"v72"
);
register
float
v_c9
asm
(
"v73"
);
register
float
v_c10
asm
(
"v74"
);
register
float
v_c11
asm
(
"v75"
);
register
float
v_c12
asm
(
"v76"
);
register
float
v_c13
asm
(
"v77"
);
register
float
v_c14
asm
(
"v78"
);
register
float
v_c15
asm
(
"v79"
);
register
float
v_c16
asm
(
"v80"
);
register
float
v_c17
asm
(
"v81"
);
register
float
v_c18
asm
(
"v82"
);
register
float
v_c19
asm
(
"v83"
);
register
float
v_c20
asm
(
"v84"
);
register
float
v_c21
asm
(
"v85"
);
register
float
v_c22
asm
(
"v86"
);
register
float
v_c23
asm
(
"v87"
);
register
float
v_c24
asm
(
"v88"
);
register
float
v_c25
asm
(
"v89"
);
register
float
v_c26
asm
(
"v90"
);
register
float
v_c27
asm
(
"v91"
);
register
float
v_c28
asm
(
"v92"
);
register
float
v_c29
asm
(
"v93"
);
register
float
v_c30
asm
(
"v94"
);
register
float
v_c31
asm
(
"v95"
);
int32_t
nan_hi
=
0x7fff0000
;
int32_t
nan_lo
=
0x00007fff
;
// in smem, the layout is M0(2)*K0(128)*M1(16)*K1(4)
// every threads need 8xK in contiguous register
// ... and every wave need the same data
int
lane_id
=
threadIdx
.
x
%
64
;
int
sld_y_os
=
(
lane_id
%
16
)
*
4
+
(
lane_id
/
16
)
*
128
;
sld_y_os
*=
2
;
// y y p p p y
// reg before shfl M0(2)*N0(2)*Nl(4)*Nw(4)*Mw(16)*Nv(4)
// but order is N0*M0*Nv
// in LDS we need store as
// M0(2)* N0(2) * Nl(4) * Nw(4) * (Mw(16)*Nv(4) + 4)
// y y wave-id lid/16 lid%16 v
// sst(v3) = (v0/16*34 + v0%16 * 2 + wid*136) * 4
int
sfl_sst
=
(
threadIdx
.
x
%
16
*
4
)
+
(
threadIdx
.
x
/
16
)
*
(
64
+
4
);
sfl_sst
*=
2
;
// from LDS we need load as
// M0(2)* N0(2) * Nl(4) * Nw(4) * (Mw(16) * Nv(4) + 4)
// ( 2 issue) (rem 32-lane) (4 wave*4issue) 2lane*1ussue(pk2)
// sld(v4) = v0/2 *34*4 + v0 % 2 *4 + wid*2 *4
int
sfl_sld
=
(
lane_id
%
2
)
*
2
+
(
lane_id
/
2
)
*
(
64
+
4
)
+
(
threadIdx
.
x
/
64
)
*
4
;
sfl_sld
*=
2
;
// B nr->kr
// clang-format off
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Winline-asm"
asm
volatile
(
asm
volatile
(
#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_
FP16
#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_
INT8
#include "uk/flatmm_sn_uk_gfx9_32x
128
x512_1x4x1_16x16x
16
.inc"
#include "uk/flatmm_sn_uk_gfx9_32x
256
x512_1x4x1_16x16x
32_int8_2
.inc"
#undef CK_TILE_FLATMM_UK_MFMA
#undef CK_TILE_FLATMM_UK_MFMA
:
[
smem_
]
"+r"
(
smem
),
:
[
smem_
]
"+r"
(
smem
),
[
s_loop_cnt
]
"+s"
(
loop_cnt
),
[
s_loop_cnt
]
"+s"
(
loop_cnt
)
[
c0
]
"+v"
(
v_c0
),
// [c0]"+v" (v_c0),
[
c1
]
"+v"
(
v_c1
),
// [c1]"+v" (v_c1),
[
c2
]
"+v"
(
v_c2
),
// [c2]"+v" (v_c2),
[
c3
]
"+v"
(
v_c3
),
// [c3]"+v" (v_c3),
[
c4
]
"+v"
(
v_c4
),
// [c4]"+v" (v_c4),
[
c5
]
"+v"
(
v_c5
),
// [c5]"+v" (v_c5),
[
c6
]
"+v"
(
v_c6
),
// [c6]"+v" (v_c6),
[
c7
]
"+v"
(
v_c7
),
// [c7]"+v" (v_c7),
[
c8
]
"+v"
(
v_c8
),
// [c8]"+v" (v_c8),
[
c9
]
"+v"
(
v_c9
),
// [c9]"+v" (v_c9),
[
c10
]
"+v"
(
v_c10
),
// [c10]"+v"(v_c10),
[
c11
]
"+v"
(
v_c11
),
// [c11]"+v"(v_c11),
[
c12
]
"+v"
(
v_c12
),
// [c12]"+v"(v_c12),
[
c13
]
"+v"
(
v_c13
),
// [c13]"+v"(v_c13),
[
c14
]
"+v"
(
v_c14
),
// [c14]"+v"(v_c14),
[
c15
]
"+v"
(
v_c15
),
// [c15]"+v"(v_c15),
[
c16
]
"+v"
(
v_c16
),
// [c16]"+v"(v_c16),
[
c17
]
"+v"
(
v_c17
),
// [c17]"+v"(v_c17),
[
c18
]
"+v"
(
v_c18
),
// [c18]"+v"(v_c18),
[
c19
]
"+v"
(
v_c19
),
// [c19]"+v"(v_c19),
[
c20
]
"+v"
(
v_c20
),
// [c20]"+v"(v_c20),
[
c21
]
"+v"
(
v_c21
),
// [c21]"+v"(v_c21),
[
c22
]
"+v"
(
v_c22
),
// [c22]"+v"(v_c22),
[
c23
]
"+v"
(
v_c23
),
// [c23]"+v"(v_c23),
[
c24
]
"+v"
(
v_c24
),
// [c24]"+v"(v_c24),
[
c25
]
"+v"
(
v_c25
),
// [c25]"+v"(v_c25),
[
c26
]
"+v"
(
v_c26
),
// [c26]"+v"(v_c26),
[
c27
]
"+v"
(
v_c27
),
// [c27]"+v"(v_c27),
[
c28
]
"+v"
(
v_c28
),
// [c28]"+v"(v_c28),
[
c29
]
"+v"
(
v_c29
),
// [c29]"+v"(v_c29),
[
c30
]
"+v"
(
v_c30
),
// [c30]"+v"(v_c30),
[
c31
]
"+v"
(
v_c31
)
// [c31]"+v"(v_c31)
:
:
[
sld_a_base
]
"n"
(
0
),
[
sld_a_base
]
"n"
(
0
),
[
shfl_base
]
"n"
(
0
),
[
shfl_base
]
"n"
(
0
),
[
v_sld_y_os
]
"v"
(
sld_y_os
),
[
v_sld_y_os
]
"v"
(
sld_y_os
),
[
v_sfl_sld
]
"v"
(
sfl_sld
),
[
v_sfl_sld
]
"v"
(
sfl_sld
),
...
@@ -491,7 +399,9 @@ struct FlatmmSn_32x128x512_1x4x1_16x16x32_FP16 : public FlatmmSn_32x128x512_1x4x
...
@@ -491,7 +399,9 @@ struct FlatmmSn_32x128x512_1x4x1_16x16x32_FP16 : public FlatmmSn_32x128x512_1x4x
[
v_os_b7
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
7
>
{}]
*
sizeof
(
BDataType
))),
[
v_os_b7
]
"v"
(
static_cast
<
index_t
>
(
cached_coords_b
[
number
<
7
>
{}]
*
sizeof
(
BDataType
))),
[
s_tile_os_o
]
"s"
(
tile_stride_o_bytes
),
[
s_tile_os_o
]
"s"
(
tile_stride_o_bytes
),
[
s_tile_os_b_half
]
"s"
(
tile_offset_half_b_bytes
),
[
s_tile_os_b
]
"s"
(
tile_stride_b_bytes
),
[
s_tile_os_b
]
"s"
(
tile_stride_b_bytes
),
[
s_tile_os_dq
]
"s"
(
tile_stride_dq_bytes
),
[
scale_0
]
"v"
(
s0
),
[
scale_0
]
"v"
(
s0
),
[
scale_1
]
"v"
(
s1
),
[
scale_1
]
"v"
(
s1
),
[
v_nan_lo
]
"v"
(
nan_lo
),
[
v_nan_lo
]
"v"
(
nan_lo
),
...
@@ -535,30 +445,45 @@ struct FlatmmSn_32x128x512_1x4x1_16x16x32_FP16 : public FlatmmSn_32x128x512_1x4x
...
@@ -535,30 +445,45 @@ struct FlatmmSn_32x128x512_1x4x1_16x16x32_FP16 : public FlatmmSn_32x128x512_1x4x
"a236"
,
"a237"
,
"a238"
,
"a239"
,
"a240"
,
"a241"
,
"a242"
,
"a243"
,
"a236"
,
"a237"
,
"a238"
,
"a239"
,
"a240"
,
"a241"
,
"a242"
,
"a243"
,
"a244"
,
"a245"
,
"a246"
,
"a247"
,
"a248"
,
"a249"
,
"a250"
,
"a251"
,
"a244"
,
"a245"
,
"a246"
,
"a247"
,
"a248"
,
"a249"
,
"a250"
,
"a251"
,
"a252"
,
"a253"
,
"a254"
,
"a255"
,
"a252"
,
"a253"
,
"a254"
,
"a255"
,
"s8"
,
"s9"
,
"s12"
,
"s13"
,
"s14"
,
"s15"
,
"s38"
,
"s39"
,
"s52"
,
"s86"
,
"s6"
,
"s7"
,
"s8"
,
"s9"
,
"s10"
,
"s11"
,
"s12"
,
"s13"
,
"s14"
,
"s15"
,
"s36"
,
"s37"
,
"s16"
,
"s17"
,
"s18"
,
"s19"
,
"s20"
,
"s21"
,
"s22"
,
"s23"
,
"s24"
,
"s25"
,
"v50"
,
"v54"
,
"v55"
,
"s26"
,
"s27"
,
"s28"
,
"s29"
,
"s30"
,
"s31"
,
"s32"
,
"s33"
,
"s34"
,
"s35"
,
"v64"
,
"v65"
,
"v66"
,
"v67"
,
"v68"
,
"v69"
,
"v70"
,
"v71"
,
"s36"
,
"s37"
,
"s38"
,
"s39"
,
"s40"
,
"s41"
,
"s42"
,
"s43"
,
"s44"
,
"s45"
,
"v72"
,
"v73"
,
"v74"
,
"v75"
,
"v76"
,
"v77"
,
"v78"
,
"v79"
,
"s46"
,
"s47"
,
"s48"
,
"s49"
,
"s50"
,
"s51"
,
"s52"
,
"s53"
,
"s54"
,
"v80"
,
"v81"
,
"v82"
,
"v83"
,
"v84"
,
"v85"
,
"v86"
,
"v87"
,
"s55"
,
"s56"
,
"s57"
,
"s58"
,
"s59"
,
"s60"
,
"s61"
,
"s62"
,
"s63"
,
"v88"
,
"v89"
,
"v90"
,
"v91"
,
"v92"
,
"v93"
,
"v94"
,
"v95"
,
"s64"
,
"s65"
,
"s66"
,
"s67"
,
"s68"
,
"s69"
,
"s70"
,
"s71"
,
"s72"
,
"v128"
,
"v129"
,
"v130"
,
"v131"
,
"s73"
,
"s74"
,
"s75"
,
"s76"
,
"s77"
,
"s78"
,
"s79"
,
"s80"
,
// s86 as tmp
"v132"
,
"v133"
,
"v134"
,
"v135"
,
"v136"
,
"v137"
,
"v138"
,
"v139"
,
"v1"
,
"v2"
,
"v3"
,
"v4"
,
"v5"
,
"v6"
,
"v7"
,
"v8"
,
"v9"
,
"v10"
,
"v140"
,
"v141"
,
"v142"
,
"v143"
,
"v144"
,
"v145"
,
"v146"
,
"v147"
,
"v11"
,
"v12"
,
"v13"
,
"v14"
,
"v15"
,
"v16"
,
"v17"
,
"v18"
,
"v19"
,
"v148"
,
"v149"
,
"v150"
,
"v151"
,
"v152"
,
"v153"
,
"v154"
,
"v155"
,
"v20"
,
"v21"
,
"v22"
,
"v23"
,
"v24"
,
"v25"
,
"v26"
,
"v27"
,
"v28"
,
"v156"
,
"v157"
,
"v158"
,
"v159"
,
"v160"
,
"v161"
,
"v162"
,
"v163"
,
"v29"
,
"v30"
,
"v31"
,
"v32"
,
"v33"
,
"v34"
,
"v35"
,
"v36"
,
"v37"
,
"v164"
,
"v165"
,
"v166"
,
"v167"
,
"v168"
,
"v169"
,
"v170"
,
"v171"
,
"v38"
,
"v39"
,
"v40"
,
"v41"
,
"v42"
,
"v43"
,
"v44"
,
"v45"
,
"v46"
,
"v172"
,
"v173"
,
"v174"
,
"v175"
,
"v176"
,
"v177"
,
"v178"
,
"v179"
,
"v47"
,
"v48"
,
"v49"
,
"v50"
,
"v51"
,
"v52"
,
"v53"
,
"v54"
,
"v55"
,
"v180"
,
"v181"
,
"v182"
,
"v183"
,
"v184"
,
"v185"
,
"v186"
,
"v187"
,
"v56"
,
"v57"
,
"v58"
,
"v59"
,
"v60"
,
"v61"
,
"v62"
,
"v63"
,
"v64"
,
"v188"
,
"v189"
,
"v190"
,
"v191"
,
"v192"
,
"v193"
,
"v194"
,
"v195"
,
"v65"
,
"v66"
,
"v67"
,
"v68"
,
"v69"
,
"v70"
,
"v71"
,
"v72"
,
"v73"
,
"v196"
,
"v197"
,
"v198"
,
"v199"
,
"v200"
,
"v201"
,
"v202"
,
"v203"
,
"v74"
,
"v75"
,
"v76"
,
"v77"
,
"v78"
,
"v79"
,
"v80"
,
"v81"
,
"v82"
,
"v204"
,
"v205"
,
"v206"
,
"v207"
,
"v208"
,
"v209"
,
"v210"
,
"v211"
,
"v83"
,
"v84"
,
"v85"
,
"v86"
,
"v87"
,
"v88"
,
"v89"
,
"v90"
,
"v91"
,
"v212"
,
"v213"
,
"v214"
,
"v215"
,
"v216"
,
"v217"
,
"v218"
,
"v219"
,
"v92"
,
"v93"
,
"v94"
,
"v95"
,
"v96"
,
"v97"
,
"v98"
,
"v99"
,
"v100"
,
"v220"
,
"v221"
,
"v222"
,
"v223"
,
"v224"
,
"v225"
,
"v226"
,
"v227"
,
"v101"
,
"v102"
,
"v103"
,
"v104"
,
"v105"
,
"v106"
,
"v107"
,
"v108"
,
"v228"
,
"v229"
,
"v230"
,
"v231"
,
"v232"
,
"v233"
,
"v234"
,
"v235"
,
"v109"
,
"v110"
,
"v111"
,
"v112"
,
"v113"
,
"v114"
,
"v115"
,
"v116"
,
"v236"
,
"v237"
,
"v238"
,
"v239"
,
"v240"
,
"v241"
,
"v242"
,
"v243"
,
"v117"
,
"v118"
,
"v119"
,
"v120"
,
"v121"
,
"v122"
,
"v123"
,
"v124"
,
"v244"
,
"v245"
,
"v246"
,
"v247"
,
"v248"
,
"v249"
,
"v250"
,
"v251"
,
"v125"
,
"v126"
,
"v127"
,
"v128"
,
"v129"
,
"v130"
,
"v131"
,
"v132"
,
"v252"
,
"v253"
,
"v254"
,
"v255"
"v133"
,
"v134"
,
"v135"
,
"v136"
,
"v137"
,
"v138"
,
"v139"
,
"v140"
,
"v141"
,
"v142"
,
"v143"
,
"v144"
,
"v145"
,
"v146"
,
"v147"
,
"v148"
,
"v149"
,
"v150"
,
"v151"
,
"v152"
,
"v153"
,
"v154"
,
"v155"
,
"v156"
,
"v157"
,
"v158"
,
"v159"
,
"v160"
,
"v161"
,
"v162"
,
"v163"
,
"v164"
,
"v165"
,
"v166"
,
"v167"
,
"v168"
,
"v169"
,
"v170"
,
"v171"
,
"v172"
,
"v173"
,
"v174"
,
"v175"
,
"v176"
,
"v177"
,
"v178"
,
"v179"
,
"v180"
,
"v181"
,
"v182"
,
"v183"
,
"v184"
,
"v185"
,
"v186"
,
"v187"
,
"v188"
,
"v189"
,
"v190"
,
"v191"
,
"v192"
,
"v193"
,
"v194"
,
"v195"
,
"v196"
,
"v197"
,
"v198"
,
"v199"
,
"v200"
,
"v201"
,
"v202"
,
"v203"
,
"v204"
,
"v205"
,
"v206"
,
"v207"
,
"v208"
,
"v209"
,
"v210"
,
"v211"
,
"v212"
,
"v213"
,
"v214"
,
"v215"
,
"v216"
,
"v217"
,
"v218"
,
"v219"
,
"v220"
,
"v221"
,
"v222"
,
"v223"
,
"v224"
,
"v225"
,
"v226"
,
"v227"
,
"v228"
,
"v229"
,
"v230"
,
"v231"
,
"v232"
,
"v233"
,
"v234"
,
"v235"
,
"v236"
,
"v237"
,
"v238"
,
"v239"
,
"v240"
,
"v241"
,
"v242"
,
"v243"
,
"v244"
,
"v245"
,
"v246"
,
"v247"
,
"v248"
,
"v249"
,
"v250"
,
"v251"
,
"v252"
,
"v253"
,
"v254"
,
"v255"
);
);
#pragma clang diagnostic pop
#pragma clang diagnostic pop
// clang-format on
// clang-format on
...
...
include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x256x512_1x4x1_16x16x32_int8_1.inc
0 → 100644
View file @
a759277d
#ifndef CK_TILE_FLATMM_UK_MFMA
#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_INT8
#endif
#if CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_INT8
# define _UK_MFMA_ "v_mfma_i32_16x16x32_i8"
# define _UK_PK_CVT_(x0_, x1_, y_) \
" v_cmp_u_f32 s[36:37], "
x0_
", "
x0_
"
\n
"
\
" v_add3_u32 v50, "
x0_
", %[v_nan_lo], 1
\n
"
\
" v_cndmask_b32 v54, v50, %[v_nan_hi], s[36:37]
\n
"
\
" v_cmp_u_f32 s[36:37], "
x1_
", "
x1_
"
\n
"
\
" v_add3_u32 v50, "
x1_
", %[v_nan_lo], 1
\n
"
\
" v_cndmask_b32 v55, v50, %[v_nan_hi], s[36:37]
\n
"
\
" v_perm_b32 "
y_
", v55, v54, s52
\n
"
# define _UK_ATOMIC_ADD_ "global_atomic_pk_add_bf16"
#elif CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_FP16
#define _UK_MFMA_ "v_mfma_f32_16x16x16_f16"
# define _UK_PK_CVT_(x0_, x1_, y_) \
" v_cvt_f16_f32 v54, "
x0_
"
\n
"
\
" v_cvt_f16_f32 v55, "
x1_
"
\n
"
\
" v_pack_b32_f16 "
y_
", v54, v55
\n
"
# define _UK_ATOMIC_ADD_ "global_atomic_pk_add_f16"
#endif
" s_mov_b32 s8, %[s_res_o0]
\n
"
" s_mov_b32 s9, %[s_res_o1]
\n
"
" s_mov_b32 s12, %[s_res_b0]
\n
"
" s_mov_b32 s13, %[s_res_b1]
\n
"
" s_mov_b32 s14, %[s_res_b2]
\n
"
" s_mov_b32 s15, %[s_res_b3]
\n
"
" s_waitcnt vmcnt(24)
\n
"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen
\n
"
" v_mul_f32 v54, v128, v128
\n
"
" v_mul_f32 v55, v129, v129
\n
"
" v_mul_f32 v56, v130, v130
\n
"
" v_mul_f32 v57, v131, v131
\n
"
" v_fma_f32 v54, v54, s77, v1
\n
"
" v_fma_f32 v55, v55, s77, v1
\n
"
" v_fma_f32 v56, v56, s77, v1
\n
"
" v_fma_f32 v57, v57, s77, v1
\n
"
" v_mul_f32 v54, v54, v128
\n
"
" v_mul_f32 v55, v55, v129
\n
"
" v_mul_f32 v56, v56, v130
\n
"
" v_mul_f32 v57, v57, v131
\n
"
" v_mul_f32 v54, v54, s6
\n
"
" v_mul_f32 v55, v55, s6
\n
"
" v_mul_f32 v56, v56, s6
\n
"
" v_mul_f32 v57, v57, s6
\n
"
" v_exp_f32 v54, v54
\n
"
" v_exp_f32 v55, v55
\n
"
" v_exp_f32 v56, v56
\n
"
" v_exp_f32 v57, v57
\n
"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024
\n
"
" v_add_f32 v54, v54, 1.0
\n
"
" v_add_f32 v55, v55, 1.0
\n
"
" v_add_f32 v56, v56, 1.0
\n
"
" v_add_f32 v57, v57, 1.0
\n
"
" v_rcp_f32 v54, v54
\n
"
" v_rcp_f32 v55, v55
\n
"
" v_rcp_f32 v56, v56
\n
"
" v_rcp_f32 v57, v57
\n
"
" v_mul_f32 v128, v128, v54
\n
"
" v_mul_f32 v129, v129, v55
\n
"
" v_mul_f32 v130, v130, v56
\n
"
" v_mul_f32 v131, v131, v57
\n
"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048
\n
"
" v_mul_f32 v54, v132, v132
\n
"
" v_mul_f32 v55, v133, v133
\n
"
" v_mul_f32 v56, v134, v134
\n
"
" v_mul_f32 v57, v135, v135
\n
"
" v_fma_f32 v54, v54, s77, v1
\n
"
" v_fma_f32 v55, v55, s77, v1
\n
"
" v_fma_f32 v56, v56, s77, v1
\n
"
" v_fma_f32 v57, v57, s77, v1
\n
"
" v_mul_f32 v54, v54, v132
\n
"
" v_mul_f32 v55, v55, v133
\n
"
" v_mul_f32 v56, v56, v134
\n
"
" v_mul_f32 v57, v57, v135
\n
"
" v_mul_f32 v54, v54, s6
\n
"
" v_mul_f32 v55, v55, s6
\n
"
" v_mul_f32 v56, v56, s6
\n
"
" v_mul_f32 v57, v57, s6
\n
"
" v_exp_f32 v54, v54
\n
"
" v_exp_f32 v55, v55
\n
"
" v_exp_f32 v56, v56
\n
"
" v_exp_f32 v57, v57
\n
"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072
\n
"
" v_add_f32 v54, v54, 1.0
\n
"
" v_add_f32 v55, v55, 1.0
\n
"
" v_add_f32 v56, v56, 1.0
\n
"
" v_add_f32 v57, v57, 1.0
\n
"
" v_rcp_f32 v54, v54
\n
"
" v_rcp_f32 v55, v55
\n
"
" v_rcp_f32 v56, v56
\n
"
" v_rcp_f32 v57, v57
\n
"
" v_mul_f32 v132, v132, v54
\n
"
" v_mul_f32 v133, v133, v55
\n
"
" v_mul_f32 v134, v134, v56
\n
"
" v_mul_f32 v135, v135, v57
\n
"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen
\n
"
" v_mul_f32 v54, v136, v136
\n
"
" v_mul_f32 v55, v137, v137
\n
"
" v_mul_f32 v56, v138, v138
\n
"
" v_mul_f32 v57, v139, v139
\n
"
" v_fma_f32 v54, v54, s77, v1
\n
"
" v_fma_f32 v55, v55, s77, v1
\n
"
" v_fma_f32 v56, v56, s77, v1
\n
"
" v_fma_f32 v57, v57, s77, v1
\n
"
" v_mul_f32 v54, v54, v136
\n
"
" v_mul_f32 v55, v55, v137
\n
"
" v_mul_f32 v56, v56, v138
\n
"
" v_mul_f32 v57, v57, v139
\n
"
" v_mul_f32 v54, v54, s6
\n
"
" v_mul_f32 v55, v55, s6
\n
"
" v_mul_f32 v56, v56, s6
\n
"
" v_mul_f32 v57, v57, s6
\n
"
" v_exp_f32 v54, v54
\n
"
" v_exp_f32 v55, v55
\n
"
" v_exp_f32 v56, v56
\n
"
" v_exp_f32 v57, v57
\n
"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024
\n
"
" v_add_f32 v54, v54, 1.0
\n
"
" v_add_f32 v55, v55, 1.0
\n
"
" v_add_f32 v56, v56, 1.0
\n
"
" v_add_f32 v57, v57, 1.0
\n
"
" v_rcp_f32 v54, v54
\n
"
" v_rcp_f32 v55, v55
\n
"
" v_rcp_f32 v56, v56
\n
"
" v_rcp_f32 v57, v57
\n
"
" v_mul_f32 v136, v136, v54
\n
"
" v_mul_f32 v137, v137, v55
\n
"
" v_mul_f32 v138, v138, v56
\n
"
" v_mul_f32 v139, v139, v57
\n
"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048
\n
"
" v_mul_f32 v54, v140, v140
\n
"
" v_mul_f32 v55, v141, v141
\n
"
" v_mul_f32 v56, v142, v142
\n
"
" v_mul_f32 v57, v143, v143
\n
"
" v_fma_f32 v54, v54, s77, v1
\n
"
" v_fma_f32 v55, v55, s77, v1
\n
"
" v_fma_f32 v56, v56, s77, v1
\n
"
" v_fma_f32 v57, v57, s77, v1
\n
"
" v_mul_f32 v54, v54, v140
\n
"
" v_mul_f32 v55, v55, v141
\n
"
" v_mul_f32 v56, v56, v142
\n
"
" v_mul_f32 v57, v57, v143
\n
"
" v_mul_f32 v54, v54, s6
\n
"
" v_mul_f32 v55, v55, s6
\n
"
" v_mul_f32 v56, v56, s6
\n
"
" v_mul_f32 v57, v57, s6
\n
"
" v_exp_f32 v54, v54
\n
"
" v_exp_f32 v55, v55
\n
"
" v_exp_f32 v56, v56
\n
"
" v_exp_f32 v57, v57
\n
"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072
\n
"
" v_add_f32 v54, v54, 1.0
\n
"
" v_add_f32 v55, v55, 1.0
\n
"
" v_add_f32 v56, v56, 1.0
\n
"
" v_add_f32 v57, v57, 1.0
\n
"
" v_rcp_f32 v54, v54
\n
"
" v_rcp_f32 v55, v55
\n
"
" v_rcp_f32 v56, v56
\n
"
" v_rcp_f32 v57, v57
\n
"
" v_mul_f32 v140, v140, v54
\n
"
" v_mul_f32 v141, v141, v55
\n
"
" v_mul_f32 v142, v142, v56
\n
"
" v_mul_f32 v143, v143, v57
\n
"
" s_waitcnt vmcnt(24)
\n
"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen
\n
"
" v_mul_f32 v54, v144, v144
\n
"
" v_mul_f32 v55, v145, v145
\n
"
" v_mul_f32 v56, v146, v146
\n
"
" v_mul_f32 v57, v147, v147
\n
"
" v_fma_f32 v54, v54, s77, v1
\n
"
" v_fma_f32 v55, v55, s77, v1
\n
"
" v_fma_f32 v56, v56, s77, v1
\n
"
" v_fma_f32 v57, v57, s77, v1
\n
"
" v_mul_f32 v54, v54, v144
\n
"
" v_mul_f32 v55, v55, v145
\n
"
" v_mul_f32 v56, v56, v146
\n
"
" v_mul_f32 v57, v57, v147
\n
"
" v_mul_f32 v54, v54, s6
\n
"
" v_mul_f32 v55, v55, s6
\n
"
" v_mul_f32 v56, v56, s6
\n
"
" v_mul_f32 v57, v57, s6
\n
"
" v_exp_f32 v54, v54
\n
"
" v_exp_f32 v55, v55
\n
"
" v_exp_f32 v56, v56
\n
"
" v_exp_f32 v57, v57
\n
"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024
\n
"
" v_add_f32 v54, v54, 1.0
\n
"
" v_add_f32 v55, v55, 1.0
\n
"
" v_add_f32 v56, v56, 1.0
\n
"
" v_add_f32 v57, v57, 1.0
\n
"
" v_rcp_f32 v54, v54
\n
"
" v_rcp_f32 v55, v55
\n
"
" v_rcp_f32 v56, v56
\n
"
" v_rcp_f32 v57, v57
\n
"
" v_mul_f32 v144, v144, v54
\n
"
" v_mul_f32 v145, v145, v55
\n
"
" v_mul_f32 v146, v146, v56
\n
"
" v_mul_f32 v147, v147, v57
\n
"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048
\n
"
" v_mul_f32 v54, v148, v148
\n
"
" v_mul_f32 v55, v149, v149
\n
"
" v_mul_f32 v56, v150, v150
\n
"
" v_mul_f32 v57, v151, v151
\n
"
" v_fma_f32 v54, v54, s77, v1
\n
"
" v_fma_f32 v55, v55, s77, v1
\n
"
" v_fma_f32 v56, v56, s77, v1
\n
"
" v_fma_f32 v57, v57, s77, v1
\n
"
" v_mul_f32 v54, v54, v148
\n
"
" v_mul_f32 v55, v55, v149
\n
"
" v_mul_f32 v56, v56, v150
\n
"
" v_mul_f32 v57, v57, v151
\n
"
" v_mul_f32 v54, v54, s6
\n
"
" v_mul_f32 v55, v55, s6
\n
"
" v_mul_f32 v56, v56, s6
\n
"
" v_mul_f32 v57, v57, s6
\n
"
" v_exp_f32 v54, v54
\n
"
" v_exp_f32 v55, v55
\n
"
" v_exp_f32 v56, v56
\n
"
" v_exp_f32 v57, v57
\n
"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072
\n
"
" v_add_f32 v54, v54, 1.0
\n
"
" v_add_f32 v55, v55, 1.0
\n
"
" v_add_f32 v56, v56, 1.0
\n
"
" v_add_f32 v57, v57, 1.0
\n
"
" v_rcp_f32 v54, v54
\n
"
" v_rcp_f32 v55, v55
\n
"
" v_rcp_f32 v56, v56
\n
"
" v_rcp_f32 v57, v57
\n
"
" v_mul_f32 v148, v148, v54
\n
"
" v_mul_f32 v149, v149, v55
\n
"
" v_mul_f32 v150, v150, v56
\n
"
" v_mul_f32 v151, v151, v57
\n
"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen
\n
"
" v_mul_f32 v54, v152, v152
\n
"
" v_mul_f32 v55, v153, v153
\n
"
" v_mul_f32 v56, v154, v154
\n
"
" v_mul_f32 v57, v155, v155
\n
"
" v_fma_f32 v54, v54, s77, v1
\n
"
" v_fma_f32 v55, v55, s77, v1
\n
"
" v_fma_f32 v56, v56, s77, v1
\n
"
" v_fma_f32 v57, v57, s77, v1
\n
"
" v_mul_f32 v54, v54, v152
\n
"
" v_mul_f32 v55, v55, v153
\n
"
" v_mul_f32 v56, v56, v154
\n
"
" v_mul_f32 v57, v57, v155
\n
"
" v_mul_f32 v54, v54, s6
\n
"
" v_mul_f32 v55, v55, s6
\n
"
" v_mul_f32 v56, v56, s6
\n
"
" v_mul_f32 v57, v57, s6
\n
"
" v_exp_f32 v54, v54
\n
"
" v_exp_f32 v55, v55
\n
"
" v_exp_f32 v56, v56
\n
"
" v_exp_f32 v57, v57
\n
"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024
\n
"
" v_add_f32 v54, v54, 1.0
\n
"
" v_add_f32 v55, v55, 1.0
\n
"
" v_add_f32 v56, v56, 1.0
\n
"
" v_add_f32 v57, v57, 1.0
\n
"
" v_rcp_f32 v54, v54
\n
"
" v_rcp_f32 v55, v55
\n
"
" v_rcp_f32 v56, v56
\n
"
" v_rcp_f32 v57, v57
\n
"
" v_mul_f32 v152, v152, v54
\n
"
" v_mul_f32 v153, v153, v55
\n
"
" v_mul_f32 v154, v154, v56
\n
"
" v_mul_f32 v155, v155, v57
\n
"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048
\n
"
" v_mul_f32 v54, v156, v156
\n
"
" v_mul_f32 v55, v157, v157
\n
"
" v_mul_f32 v56, v158, v158
\n
"
" v_mul_f32 v57, v159, v159
\n
"
" v_fma_f32 v54, v54, s77, v1
\n
"
" v_fma_f32 v55, v55, s77, v1
\n
"
" v_fma_f32 v56, v56, s77, v1
\n
"
" v_fma_f32 v57, v57, s77, v1
\n
"
" v_mul_f32 v54, v54, v156
\n
"
" v_mul_f32 v55, v55, v157
\n
"
" v_mul_f32 v56, v56, v158
\n
"
" v_mul_f32 v57, v57, v159
\n
"
" v_mul_f32 v54, v54, s6
\n
"
" v_mul_f32 v55, v55, s6
\n
"
" v_mul_f32 v56, v56, s6
\n
"
" v_mul_f32 v57, v57, s6
\n
"
" v_exp_f32 v54, v54
\n
"
" v_exp_f32 v55, v55
\n
"
" v_exp_f32 v56, v56
\n
"
" v_exp_f32 v57, v57
\n
"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072
\n
"
" s_add_u32 s12, %[s_tile_os_b_half], s12
\n
"
" s_addc_u32 s13, 0, s13
\n
"
" v_add_f32 v54, v54, 1.0
\n
"
" v_add_f32 v55, v55, 1.0
\n
"
" v_add_f32 v56, v56, 1.0
\n
"
" v_add_f32 v57, v57, 1.0
\n
"
" v_rcp_f32 v54, v54
\n
"
" v_rcp_f32 v55, v55
\n
"
" v_rcp_f32 v56, v56
\n
"
" v_rcp_f32 v57, v57
\n
"
" v_mul_f32 v156, v156, v54
\n
"
" v_mul_f32 v157, v157, v55
\n
"
" v_mul_f32 v158, v158, v56
\n
"
" v_mul_f32 v159, v159, v57
\n
"
" s_waitcnt vmcnt(24)
\n
"
" buffer_load_dwordx4 acc[64:67], %[v_os_b0], s[12:15], 0 offen
\n
"
" v_mul_f32 v54, v160, v160
\n
"
" v_mul_f32 v55, v161, v161
\n
"
" v_mul_f32 v56, v162, v162
\n
"
" v_mul_f32 v57, v163, v163
\n
"
" v_fma_f32 v54, v54, s77, v1
\n
"
" v_fma_f32 v55, v55, s77, v1
\n
"
" v_fma_f32 v56, v56, s77, v1
\n
"
" v_fma_f32 v57, v57, s77, v1
\n
"
" v_mul_f32 v54, v54, v160
\n
"
" v_mul_f32 v55, v55, v161
\n
"
" v_mul_f32 v56, v56, v162
\n
"
" v_mul_f32 v57, v57, v163
\n
"
" v_mul_f32 v54, v54, s6
\n
"
" v_mul_f32 v55, v55, s6
\n
"
" v_mul_f32 v56, v56, s6
\n
"
" v_mul_f32 v57, v57, s6
\n
"
" v_exp_f32 v54, v54
\n
"
" v_exp_f32 v55, v55
\n
"
" v_exp_f32 v56, v56
\n
"
" v_exp_f32 v57, v57
\n
"
" buffer_load_dwordx4 acc[68:71], %[v_os_b0], s[12:15], 0 offen offset:1024
\n
"
" v_add_f32 v54, v54, 1.0
\n
"
" v_add_f32 v55, v55, 1.0
\n
"
" v_add_f32 v56, v56, 1.0
\n
"
" v_add_f32 v57, v57, 1.0
\n
"
" v_rcp_f32 v54, v54
\n
"
" v_rcp_f32 v55, v55
\n
"
" v_rcp_f32 v56, v56
\n
"
" v_rcp_f32 v57, v57
\n
"
" v_mul_f32 v160, v160, v54
\n
"
" v_mul_f32 v161, v161, v55
\n
"
" v_mul_f32 v162, v162, v56
\n
"
" v_mul_f32 v163, v163, v57
\n
"
" buffer_load_dwordx4 acc[72:75], %[v_os_b0], s[12:15], 0 offen offset:2048
\n
"
" v_mul_f32 v54, v164, v164
\n
"
" v_mul_f32 v55, v165, v165
\n
"
" v_mul_f32 v56, v166, v166
\n
"
" v_mul_f32 v57, v167, v167
\n
"
" v_fma_f32 v54, v54, s77, v1
\n
"
" v_fma_f32 v55, v55, s77, v1
\n
"
" v_fma_f32 v56, v56, s77, v1
\n
"
" v_fma_f32 v57, v57, s77, v1
\n
"
" v_mul_f32 v54, v54, v164
\n
"
" v_mul_f32 v55, v55, v165
\n
"
" v_mul_f32 v56, v56, v166
\n
"
" v_mul_f32 v57, v57, v167
\n
"
" v_mul_f32 v54, v54, s6
\n
"
" v_mul_f32 v55, v55, s6
\n
"
" v_mul_f32 v56, v56, s6
\n
"
" v_mul_f32 v57, v57, s6
\n
"
" v_exp_f32 v54, v54
\n
"
" v_exp_f32 v55, v55
\n
"
" v_exp_f32 v56, v56
\n
"
" v_exp_f32 v57, v57
\n
"
" buffer_load_dwordx4 acc[76:79], %[v_os_b0], s[12:15], 0 offen offset:3072
\n
"
" v_add_f32 v54, v54, 1.0
\n
"
" v_add_f32 v55, v55, 1.0
\n
"
" v_add_f32 v56, v56, 1.0
\n
"
" v_add_f32 v57, v57, 1.0
\n
"
" v_rcp_f32 v54, v54
\n
"
" v_rcp_f32 v55, v55
\n
"
" v_rcp_f32 v56, v56
\n
"
" v_rcp_f32 v57, v57
\n
"
" v_mul_f32 v164, v164, v54
\n
"
" v_mul_f32 v165, v165, v55
\n
"
" v_mul_f32 v166, v166, v56
\n
"
" v_mul_f32 v167, v167, v57
\n
"
" buffer_load_dwordx4 acc[80:83], %[v_os_b1], s[12:15], 0 offen
\n
"
" v_mul_f32 v54, v168, v168
\n
"
" v_mul_f32 v55, v169, v169
\n
"
" v_mul_f32 v56, v170, v170
\n
"
" v_mul_f32 v57, v171, v171
\n
"
" v_fma_f32 v54, v54, s77, v1
\n
"
" v_fma_f32 v55, v55, s77, v1
\n
"
" v_fma_f32 v56, v56, s77, v1
\n
"
" v_fma_f32 v57, v57, s77, v1
\n
"
" v_mul_f32 v54, v54, v168
\n
"
" v_mul_f32 v55, v55, v169
\n
"
" v_mul_f32 v56, v56, v170
\n
"
" v_mul_f32 v57, v57, v171
\n
"
" v_mul_f32 v54, v54, s6
\n
"
" v_mul_f32 v55, v55, s6
\n
"
" v_mul_f32 v56, v56, s6
\n
"
" v_mul_f32 v57, v57, s6
\n
"
" v_exp_f32 v54, v54
\n
"
" v_exp_f32 v55, v55
\n
"
" v_exp_f32 v56, v56
\n
"
" v_exp_f32 v57, v57
\n
"
" buffer_load_dwordx4 acc[84:87], %[v_os_b1], s[12:15], 0 offen offset:1024
\n
"
" v_add_f32 v54, v54, 1.0
\n
"
" v_add_f32 v55, v55, 1.0
\n
"
" v_add_f32 v56, v56, 1.0
\n
"
" v_add_f32 v57, v57, 1.0
\n
"
" v_rcp_f32 v54, v54
\n
"
" v_rcp_f32 v55, v55
\n
"
" v_rcp_f32 v56, v56
\n
"
" v_rcp_f32 v57, v57
\n
"
" v_mul_f32 v168, v168, v54
\n
"
" v_mul_f32 v169, v169, v55
\n
"
" v_mul_f32 v170, v170, v56
\n
"
" v_mul_f32 v171, v171, v57
\n
"
" buffer_load_dwordx4 acc[88:91], %[v_os_b1], s[12:15], 0 offen offset:2048
\n
"
" v_mul_f32 v54, v172, v172
\n
"
" v_mul_f32 v55, v173, v173
\n
"
" v_mul_f32 v56, v174, v174
\n
"
" v_mul_f32 v57, v175, v175
\n
"
" v_fma_f32 v54, v54, s77, v1
\n
"
" v_fma_f32 v55, v55, s77, v1
\n
"
" v_fma_f32 v56, v56, s77, v1
\n
"
" v_fma_f32 v57, v57, s77, v1
\n
"
" v_mul_f32 v54, v54, v172
\n
"
" v_mul_f32 v55, v55, v173
\n
"
" v_mul_f32 v56, v56, v174
\n
"
" v_mul_f32 v57, v57, v175
\n
"
" v_mul_f32 v54, v54, s6
\n
"
" v_mul_f32 v55, v55, s6
\n
"
" v_mul_f32 v56, v56, s6
\n
"
" v_mul_f32 v57, v57, s6
\n
"
" v_exp_f32 v54, v54
\n
"
" v_exp_f32 v55, v55
\n
"
" v_exp_f32 v56, v56
\n
"
" v_exp_f32 v57, v57
\n
"
" buffer_load_dwordx4 acc[92:95], %[v_os_b1], s[12:15], 0 offen offset:3072
\n
"
" v_add_f32 v54, v54, 1.0
\n
"
" v_add_f32 v55, v55, 1.0
\n
"
" v_add_f32 v56, v56, 1.0
\n
"
" v_add_f32 v57, v57, 1.0
\n
"
" v_rcp_f32 v54, v54
\n
"
" v_rcp_f32 v55, v55
\n
"
" v_rcp_f32 v56, v56
\n
"
" v_rcp_f32 v57, v57
\n
"
" v_mul_f32 v172, v172, v54
\n
"
" v_mul_f32 v173, v173, v55
\n
"
" v_mul_f32 v174, v174, v56
\n
"
" v_mul_f32 v175, v175, v57
\n
"
" s_waitcnt vmcnt(24)
\n
"
" buffer_load_dwordx4 acc[96:99], %[v_os_b2], s[12:15], 0 offen
\n
"
" v_mul_f32 v54, v176, v176
\n
"
" v_mul_f32 v55, v177, v177
\n
"
" v_mul_f32 v56, v178, v178
\n
"
" v_mul_f32 v57, v179, v179
\n
"
" v_fma_f32 v54, v54, s77, v1
\n
"
" v_fma_f32 v55, v55, s77, v1
\n
"
" v_fma_f32 v56, v56, s77, v1
\n
"
" v_fma_f32 v57, v57, s77, v1
\n
"
" v_mul_f32 v54, v54, v176
\n
"
" v_mul_f32 v55, v55, v177
\n
"
" v_mul_f32 v56, v56, v178
\n
"
" v_mul_f32 v57, v57, v179
\n
"
" v_mul_f32 v54, v54, s6
\n
"
" v_mul_f32 v55, v55, s6
\n
"
" v_mul_f32 v56, v56, s6
\n
"
" v_mul_f32 v57, v57, s6
\n
"
" v_exp_f32 v54, v54
\n
"
" v_exp_f32 v55, v55
\n
"
" v_exp_f32 v56, v56
\n
"
" v_exp_f32 v57, v57
\n
"
" buffer_load_dwordx4 acc[100:103], %[v_os_b2], s[12:15], 0 offen offset:1024
\n
"
" v_add_f32 v54, v54, 1.0
\n
"
" v_add_f32 v55, v55, 1.0
\n
"
" v_add_f32 v56, v56, 1.0
\n
"
" v_add_f32 v57, v57, 1.0
\n
"
" v_rcp_f32 v54, v54
\n
"
" v_rcp_f32 v55, v55
\n
"
" v_rcp_f32 v56, v56
\n
"
" v_rcp_f32 v57, v57
\n
"
" v_mul_f32 v176, v176, v54
\n
"
" v_mul_f32 v177, v177, v55
\n
"
" v_mul_f32 v178, v178, v56
\n
"
" v_mul_f32 v179, v179, v57
\n
"
" buffer_load_dwordx4 acc[104:107], %[v_os_b2], s[12:15], 0 offen offset:2048
\n
"
" v_mul_f32 v54, v180, v180
\n
"
" v_mul_f32 v55, v181, v181
\n
"
" v_mul_f32 v56, v182, v182
\n
"
" v_mul_f32 v57, v183, v183
\n
"
" v_fma_f32 v54, v54, s77, v1
\n
"
" v_fma_f32 v55, v55, s77, v1
\n
"
" v_fma_f32 v56, v56, s77, v1
\n
"
" v_fma_f32 v57, v57, s77, v1
\n
"
" v_mul_f32 v54, v54, v180
\n
"
" v_mul_f32 v55, v55, v181
\n
"
" v_mul_f32 v56, v56, v182
\n
"
" v_mul_f32 v57, v57, v183
\n
"
" v_mul_f32 v54, v54, s6
\n
"
" v_mul_f32 v55, v55, s6
\n
"
" v_mul_f32 v56, v56, s6
\n
"
" v_mul_f32 v57, v57, s6
\n
"
" v_exp_f32 v54, v54
\n
"
" v_exp_f32 v55, v55
\n
"
" v_exp_f32 v56, v56
\n
"
" v_exp_f32 v57, v57
\n
"
" buffer_load_dwordx4 acc[108:111], %[v_os_b2], s[12:15], 0 offen offset:3072
\n
"
" v_add_f32 v54, v54, 1.0
\n
"
" v_add_f32 v55, v55, 1.0
\n
"
" v_add_f32 v56, v56, 1.0
\n
"
" v_add_f32 v57, v57, 1.0
\n
"
" v_rcp_f32 v54, v54
\n
"
" v_rcp_f32 v55, v55
\n
"
" v_rcp_f32 v56, v56
\n
"
" v_rcp_f32 v57, v57
\n
"
" v_mul_f32 v180, v180, v54
\n
"
" v_mul_f32 v181, v181, v55
\n
"
" v_mul_f32 v182, v182, v56
\n
"
" v_mul_f32 v183, v183, v57
\n
"
" buffer_load_dwordx4 acc[112:115], %[v_os_b3], s[12:15], 0 offen
\n
"
" v_mul_f32 v54, v184, v184
\n
"
" v_mul_f32 v55, v185, v185
\n
"
" v_mul_f32 v56, v186, v186
\n
"
" v_mul_f32 v57, v187, v187
\n
"
" v_fma_f32 v54, v54, s77, v1
\n
"
" v_fma_f32 v55, v55, s77, v1
\n
"
" v_fma_f32 v56, v56, s77, v1
\n
"
" v_fma_f32 v57, v57, s77, v1
\n
"
" v_mul_f32 v54, v54, v184
\n
"
" v_mul_f32 v55, v55, v185
\n
"
" v_mul_f32 v56, v56, v186
\n
"
" v_mul_f32 v57, v57, v187
\n
"
" v_mul_f32 v54, v54, s6
\n
"
" v_mul_f32 v55, v55, s6
\n
"
" v_mul_f32 v56, v56, s6
\n
"
" v_mul_f32 v57, v57, s6
\n
"
" v_exp_f32 v54, v54
\n
"
" v_exp_f32 v55, v55
\n
"
" v_exp_f32 v56, v56
\n
"
" v_exp_f32 v57, v57
\n
"
" buffer_load_dwordx4 acc[116:119], %[v_os_b3], s[12:15], 0 offen offset:1024
\n
"
" v_add_f32 v54, v54, 1.0
\n
"
" v_add_f32 v55, v55, 1.0
\n
"
" v_add_f32 v56, v56, 1.0
\n
"
" v_add_f32 v57, v57, 1.0
\n
"
" v_rcp_f32 v54, v54
\n
"
" v_rcp_f32 v55, v55
\n
"
" v_rcp_f32 v56, v56
\n
"
" v_rcp_f32 v57, v57
\n
"
" v_mul_f32 v184, v184, v54
\n
"
" v_mul_f32 v185, v185, v55
\n
"
" v_mul_f32 v186, v186, v56
\n
"
" v_mul_f32 v187, v187, v57
\n
"
" buffer_load_dwordx4 acc[120:123], %[v_os_b3], s[12:15], 0 offen offset:2048
\n
"
" v_mul_f32 v54, v188, v188
\n
"
" v_mul_f32 v55, v189, v189
\n
"
" v_mul_f32 v56, v190, v190
\n
"
" v_mul_f32 v57, v191, v191
\n
"
" v_fma_f32 v54, v54, s77, v1
\n
"
" v_fma_f32 v55, v55, s77, v1
\n
"
" v_fma_f32 v56, v56, s77, v1
\n
"
" v_fma_f32 v57, v57, s77, v1
\n
"
" v_mul_f32 v54, v54, v188
\n
"
" v_mul_f32 v55, v55, v189
\n
"
" v_mul_f32 v56, v56, v190
\n
"
" v_mul_f32 v57, v57, v191
\n
"
" v_mul_f32 v54, v54, s6
\n
"
" v_mul_f32 v55, v55, s6
\n
"
" v_mul_f32 v56, v56, s6
\n
"
" v_mul_f32 v57, v57, s6
\n
"
" v_exp_f32 v54, v54
\n
"
" v_exp_f32 v55, v55
\n
"
" v_exp_f32 v56, v56
\n
"
" v_exp_f32 v57, v57
\n
"
" buffer_load_dwordx4 acc[124:127], %[v_os_b3], s[12:15], 0 offen offset:3072
\n
"
" v_add_f32 v54, v54, 1.0
\n
"
" v_add_f32 v55, v55, 1.0
\n
"
" v_add_f32 v56, v56, 1.0
\n
"
" v_add_f32 v57, v57, 1.0
\n
"
" v_rcp_f32 v54, v54
\n
"
" v_rcp_f32 v55, v55
\n
"
" v_rcp_f32 v56, v56
\n
"
" v_rcp_f32 v57, v57
\n
"
" v_mul_f32 v188, v188, v54
\n
"
" v_mul_f32 v189, v189, v55
\n
"
" v_mul_f32 v190, v190, v56
\n
"
" v_mul_f32 v191, v191, v57
\n
"
" v_mul_f32 v128, v18, v128 row_newbcast:0
\n
"
" v_mul_f32 v129, v18, v129 row_newbcast:1
\n
"
" v_mul_f32 v130, v18, v130 row_newbcast:2
\n
"
" v_mul_f32 v131, v18, v131 row_newbcast:3
\n
"
" v_mul_f32 v132, v18, v132 row_newbcast:0
\n
"
" v_mul_f32 v133, v18, v133 row_newbcast:1
\n
"
" v_mul_f32 v134, v18, v134 row_newbcast:2
\n
"
" v_mul_f32 v135, v18, v135 row_newbcast:3
\n
"
" v_mul_f32 v136, v18, v136 row_newbcast:4
\n
"
" v_mul_f32 v137, v18, v137 row_newbcast:5
\n
"
" v_mul_f32 v138, v18, v138 row_newbcast:6
\n
"
" v_mul_f32 v139, v18, v139 row_newbcast:7
\n
"
" v_mul_f32 v140, v18, v140 row_newbcast:4
\n
"
" v_mul_f32 v141, v18, v141 row_newbcast:5
\n
"
" v_mul_f32 v142, v18, v142 row_newbcast:6
\n
"
" v_mul_f32 v143, v18, v143 row_newbcast:7
\n
"
" v_mul_f32 v144, v18, v144 row_newbcast:8
\n
"
" v_mul_f32 v145, v18, v145 row_newbcast:9
\n
"
" v_mul_f32 v146, v18, v146 row_newbcast:10
\n
"
" v_mul_f32 v147, v18, v147 row_newbcast:11
\n
"
" v_mul_f32 v148, v18, v148 row_newbcast:8
\n
"
" v_mul_f32 v149, v18, v149 row_newbcast:9
\n
"
" v_mul_f32 v150, v18, v150 row_newbcast:10
\n
"
" v_mul_f32 v151, v18, v151 row_newbcast:11
\n
"
" v_mul_f32 v152, v18, v152 row_newbcast:12
\n
"
" v_mul_f32 v153, v18, v153 row_newbcast:13
\n
"
" v_mul_f32 v154, v18, v154 row_newbcast:14
\n
"
" v_mul_f32 v155, v18, v155 row_newbcast:15
\n
"
" v_mul_f32 v156, v18, v156 row_newbcast:12
\n
"
" v_mul_f32 v157, v18, v157 row_newbcast:13
\n
"
" v_mul_f32 v158, v18, v158 row_newbcast:14
\n
"
" v_mul_f32 v159, v18, v159 row_newbcast:15
\n
"
" v_mul_f32 v160, v19, v160 row_newbcast:0
\n
"
" v_mul_f32 v161, v19, v161 row_newbcast:1
\n
"
" v_mul_f32 v162, v19, v162 row_newbcast:2
\n
"
" v_mul_f32 v163, v19, v163 row_newbcast:3
\n
"
" v_mul_f32 v164, v19, v164 row_newbcast:0
\n
"
" v_mul_f32 v165, v19, v165 row_newbcast:1
\n
"
" v_mul_f32 v166, v19, v166 row_newbcast:2
\n
"
" v_mul_f32 v167, v19, v167 row_newbcast:3
\n
"
" v_mul_f32 v168, v19, v168 row_newbcast:4
\n
"
" v_mul_f32 v169, v19, v169 row_newbcast:5
\n
"
" v_mul_f32 v170, v19, v170 row_newbcast:6
\n
"
" v_mul_f32 v171, v19, v171 row_newbcast:7
\n
"
" v_mul_f32 v172, v19, v172 row_newbcast:4
\n
"
" v_mul_f32 v173, v19, v173 row_newbcast:5
\n
"
" v_mul_f32 v174, v19, v174 row_newbcast:6
\n
"
" v_mul_f32 v175, v19, v175 row_newbcast:7
\n
"
" v_mul_f32 v176, v19, v176 row_newbcast:8
\n
"
" v_mul_f32 v177, v19, v177 row_newbcast:9
\n
"
" v_mul_f32 v178, v19, v178 row_newbcast:10
\n
"
" v_mul_f32 v179, v19, v179 row_newbcast:11
\n
"
" v_mul_f32 v180, v19, v180 row_newbcast:8
\n
"
" v_mul_f32 v181, v19, v181 row_newbcast:9
\n
"
" v_mul_f32 v182, v19, v182 row_newbcast:10
\n
"
" v_mul_f32 v183, v19, v183 row_newbcast:11
\n
"
" v_mul_f32 v184, v19, v184 row_newbcast:12
\n
"
" v_mul_f32 v185, v19, v185 row_newbcast:13
\n
"
" v_mul_f32 v186, v19, v186 row_newbcast:14
\n
"
" v_mul_f32 v187, v19, v187 row_newbcast:15
\n
"
" v_mul_f32 v188, v19, v188 row_newbcast:12
\n
"
" v_mul_f32 v189, v19, v189 row_newbcast:13
\n
"
" v_mul_f32 v190, v19, v190 row_newbcast:14
\n
"
" v_mul_f32 v191, v19, v191 row_newbcast:15
\n
"
" buffer_load_dword v12, v5, s[16:19], 0 offen
\n
"
" v_mov_b32 v22, 0x358637bd
\n
"
" v_mov_b32 v23, 0x358637bd
\n
"
" v_max3_f32 v22, abs(v128), abs(v129), v22
\n
"
" v_max3_f32 v22, abs(v130), abs(v131), v22
\n
"
" v_max3_f32 v23, abs(v132), abs(v133), v23
\n
"
" v_max3_f32 v23, abs(v134), abs(v135), v23
\n
"
" v_max3_f32 v22, abs(v136), abs(v137), v22
\n
"
" v_max3_f32 v22, abs(v138), abs(v139), v22
\n
"
" v_max3_f32 v23, abs(v140), abs(v141), v23
\n
"
" v_max3_f32 v23, abs(v142), abs(v143), v23
\n
"
" v_max3_f32 v22, abs(v144), abs(v145), v22
\n
"
" v_max3_f32 v22, abs(v146), abs(v147), v22
\n
"
" v_max3_f32 v23, abs(v148), abs(v149), v23
\n
"
" v_max3_f32 v23, abs(v150), abs(v151), v23
\n
"
" v_max3_f32 v22, abs(v152), abs(v153), v22
\n
"
" v_max3_f32 v22, abs(v154), abs(v155), v22
\n
"
" v_max3_f32 v23, abs(v156), abs(v157), v23
\n
"
" v_max3_f32 v23, abs(v158), abs(v159), v23
\n
"
" v_max3_f32 v22, abs(v160), abs(v161), v22
\n
"
" v_max3_f32 v22, abs(v162), abs(v163), v22
\n
"
" v_max3_f32 v23, abs(v164), abs(v165), v23
\n
"
" v_max3_f32 v23, abs(v166), abs(v167), v23
\n
"
" v_max3_f32 v22, abs(v168), abs(v169), v22
\n
"
" v_max3_f32 v22, abs(v170), abs(v171), v22
\n
"
" v_max3_f32 v23, abs(v172), abs(v173), v23
\n
"
" v_max3_f32 v23, abs(v174), abs(v175), v23
\n
"
" v_max3_f32 v22, abs(v176), abs(v177), v22
\n
"
" v_max3_f32 v22, abs(v178), abs(v179), v22
\n
"
" v_max3_f32 v23, abs(v180), abs(v181), v23
\n
"
" v_max3_f32 v23, abs(v182), abs(v183), v23
\n
"
" v_max3_f32 v22, abs(v184), abs(v185), v22
\n
"
" v_max3_f32 v22, abs(v186), abs(v187), v22
\n
"
" v_max3_f32 v23, abs(v188), abs(v189), v23
\n
"
" v_max3_f32 v23, abs(v190), abs(v191), v23
\n
"
" v_lshlrev_b32 v54, 3, v0
\n
"
" s_mul_i32 s60, 0x00000200, s7
\n
"
" v_add_u32 v54, s60, v54
\n
"
" ds_write_b64 v54, v[22:23] offset:16640
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_barrier
\n
"
" v_and_b32 v54, 15, v0
\n
"
" v_lshlrev_b32 v54, 3, v54
\n
"
" ds_read_b64 v[96:97], v54 offset:16640
\n
"
" ds_read_b64 v[98:99], v54 offset:16768
\n
"
" ds_read_b64 v[100:101], v54 offset:16896
\n
"
" ds_read_b64 v[102:103], v54 offset:17024
\n
"
" ds_read_b64 v[104:105], v54 offset:17152
\n
"
" ds_read_b64 v[106:107], v54 offset:17280
\n
"
" ds_read_b64 v[108:109], v54 offset:17408
\n
"
" ds_read_b64 v[110:111], v54 offset:17536
\n
"
" ds_read_b64 v[112:113], v54 offset:17664
\n
"
" ds_read_b64 v[114:115], v54 offset:17792
\n
"
" ds_read_b64 v[116:117], v54 offset:17920
\n
"
" ds_read_b64 v[118:119], v54 offset:18048
\n
"
" ds_read_b64 v[120:121], v54 offset:18176
\n
"
" ds_read_b64 v[122:123], v54 offset:18304
\n
"
" ds_read_b64 v[124:125], v54 offset:18432
\n
"
" ds_read_b64 v[126:127], v54 offset:18560
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" v_max3_f32 v22, abs(v96), abs(v98), v22
\n
"
" v_max3_f32 v23, abs(v97), abs(v99), v23
\n
"
" v_max3_f32 v22, abs(v100), abs(v102), v22
\n
"
" v_max3_f32 v23, abs(v101), abs(v103), v23
\n
"
" v_max3_f32 v22, abs(v104), abs(v106), v22
\n
"
" v_max3_f32 v23, abs(v105), abs(v107), v23
\n
"
" v_max3_f32 v22, abs(v108), abs(v110), v22
\n
"
" v_max3_f32 v23, abs(v109), abs(v111), v23
\n
"
" v_max3_f32 v22, abs(v112), abs(v114), v22
\n
"
" v_max3_f32 v23, abs(v113), abs(v115), v23
\n
"
" v_max3_f32 v22, abs(v116), abs(v118), v22
\n
"
" v_max3_f32 v23, abs(v117), abs(v119), v23
\n
"
" v_max3_f32 v22, abs(v120), abs(v122), v22
\n
"
" v_max3_f32 v23, abs(v121), abs(v123), v23
\n
"
" v_max3_f32 v22, abs(v124), abs(v126), v22
\n
"
" v_max3_f32 v23, abs(v125), abs(v127), v23
\n
"
" v_rcp_f32 v22, v22
\n
"
" v_rcp_f32 v23, v23
\n
"
" v_mul_f32 v22, 0x42fe0000, v22
\n
"
" v_mul_f32 v23, 0x42fe0000, v23
\n
"
" v_mul_f32 v128, v22, v128
\n
"
" v_mul_f32 v129, v22, v129
\n
"
" v_mul_f32 v130, v22, v130
\n
"
" v_mul_f32 v131, v22, v131
\n
"
" v_cvt_i32_f32 v128, v128
\n
"
" v_cvt_i32_f32 v129, v129
\n
"
" v_cvt_i32_f32 v130, v130
\n
"
" v_cvt_i32_f32 v131, v131
\n
"
" v_perm_b32 v128, v129, v128, s53
\n
"
" v_perm_b32 v128, v130, v128, s54
\n
"
" v_perm_b32 v128, v131, v128, s55
\n
"
" v_mul_f32 v132, v23, v132
\n
"
" v_mul_f32 v133, v23, v133
\n
"
" v_mul_f32 v134, v23, v134
\n
"
" v_mul_f32 v135, v23, v135
\n
"
" v_cvt_i32_f32 v132, v132
\n
"
" v_cvt_i32_f32 v133, v133
\n
"
" v_cvt_i32_f32 v134, v134
\n
"
" v_cvt_i32_f32 v135, v135
\n
"
" v_perm_b32 v129, v133, v132, s53
\n
"
" v_perm_b32 v129, v134, v129, s54
\n
"
" v_perm_b32 v129, v135, v129, s55
\n
"
" v_mul_f32 v136, v22, v136
\n
"
" v_mul_f32 v137, v22, v137
\n
"
" v_mul_f32 v138, v22, v138
\n
"
" v_mul_f32 v139, v22, v139
\n
"
" v_cvt_i32_f32 v136, v136
\n
"
" v_cvt_i32_f32 v137, v137
\n
"
" v_cvt_i32_f32 v138, v138
\n
"
" v_cvt_i32_f32 v139, v139
\n
"
" v_perm_b32 v130, v137, v136, s53
\n
"
" v_perm_b32 v130, v138, v130, s54
\n
"
" v_perm_b32 v130, v139, v130, s55
\n
"
" v_mul_f32 v140, v23, v140
\n
"
" v_mul_f32 v141, v23, v141
\n
"
" v_mul_f32 v142, v23, v142
\n
"
" v_mul_f32 v143, v23, v143
\n
"
" v_cvt_i32_f32 v140, v140
\n
"
" v_cvt_i32_f32 v141, v141
\n
"
" v_cvt_i32_f32 v142, v142
\n
"
" v_cvt_i32_f32 v143, v143
\n
"
" v_perm_b32 v131, v141, v140, s53
\n
"
" v_perm_b32 v131, v142, v131, s54
\n
"
" v_perm_b32 v131, v143, v131, s55
\n
"
" v_mul_f32 v144, v22, v144
\n
"
" v_mul_f32 v145, v22, v145
\n
"
" v_mul_f32 v146, v22, v146
\n
"
" v_mul_f32 v147, v22, v147
\n
"
" v_cvt_i32_f32 v144, v144
\n
"
" v_cvt_i32_f32 v145, v145
\n
"
" v_cvt_i32_f32 v146, v146
\n
"
" v_cvt_i32_f32 v147, v147
\n
"
" v_perm_b32 v132, v145, v144, s53
\n
"
" v_perm_b32 v132, v146, v132, s54
\n
"
" v_perm_b32 v132, v147, v132, s55
\n
"
" v_mul_f32 v148, v23, v148
\n
"
" v_mul_f32 v149, v23, v149
\n
"
" v_mul_f32 v150, v23, v150
\n
"
" v_mul_f32 v151, v23, v151
\n
"
" v_cvt_i32_f32 v148, v148
\n
"
" v_cvt_i32_f32 v149, v149
\n
"
" v_cvt_i32_f32 v150, v150
\n
"
" v_cvt_i32_f32 v151, v151
\n
"
" v_perm_b32 v133, v149, v148, s53
\n
"
" v_perm_b32 v133, v150, v133, s54
\n
"
" v_perm_b32 v133, v151, v133, s55
\n
"
" v_mul_f32 v152, v22, v152
\n
"
" v_mul_f32 v153, v22, v153
\n
"
" v_mul_f32 v154, v22, v154
\n
"
" v_mul_f32 v155, v22, v155
\n
"
" v_cvt_i32_f32 v152, v152
\n
"
" v_cvt_i32_f32 v153, v153
\n
"
" v_cvt_i32_f32 v154, v154
\n
"
" v_cvt_i32_f32 v155, v155
\n
"
" v_perm_b32 v134, v153, v152, s53
\n
"
" v_perm_b32 v134, v154, v134, s54
\n
"
" v_perm_b32 v134, v155, v134, s55
\n
"
" v_mul_f32 v156, v23, v156
\n
"
" v_mul_f32 v157, v23, v157
\n
"
" v_mul_f32 v158, v23, v158
\n
"
" v_mul_f32 v159, v23, v159
\n
"
" v_cvt_i32_f32 v156, v156
\n
"
" v_cvt_i32_f32 v157, v157
\n
"
" v_cvt_i32_f32 v158, v158
\n
"
" v_cvt_i32_f32 v159, v159
\n
"
" v_perm_b32 v135, v157, v156, s53
\n
"
" v_perm_b32 v135, v158, v135, s54
\n
"
" v_perm_b32 v135, v159, v135, s55
\n
"
" v_mul_f32 v160, v22, v160
\n
"
" v_mul_f32 v161, v22, v161
\n
"
" v_mul_f32 v162, v22, v162
\n
"
" v_mul_f32 v163, v22, v163
\n
"
" v_cvt_i32_f32 v160, v160
\n
"
" v_cvt_i32_f32 v161, v161
\n
"
" v_cvt_i32_f32 v162, v162
\n
"
" v_cvt_i32_f32 v163, v163
\n
"
" v_perm_b32 v136, v161, v160, s53
\n
"
" v_perm_b32 v136, v162, v136, s54
\n
"
" v_perm_b32 v136, v163, v136, s55
\n
"
" v_mul_f32 v164, v23, v164
\n
"
" v_mul_f32 v165, v23, v165
\n
"
" v_mul_f32 v166, v23, v166
\n
"
" v_mul_f32 v167, v23, v167
\n
"
" v_cvt_i32_f32 v164, v164
\n
"
" v_cvt_i32_f32 v165, v165
\n
"
" v_cvt_i32_f32 v166, v166
\n
"
" v_cvt_i32_f32 v167, v167
\n
"
" v_perm_b32 v137, v165, v164, s53
\n
"
" v_perm_b32 v137, v166, v137, s54
\n
"
" v_perm_b32 v137, v167, v137, s55
\n
"
" v_mul_f32 v168, v22, v168
\n
"
" v_mul_f32 v169, v22, v169
\n
"
" v_mul_f32 v170, v22, v170
\n
"
" v_mul_f32 v171, v22, v171
\n
"
" v_cvt_i32_f32 v168, v168
\n
"
" v_cvt_i32_f32 v169, v169
\n
"
" v_cvt_i32_f32 v170, v170
\n
"
" v_cvt_i32_f32 v171, v171
\n
"
" v_perm_b32 v138, v169, v168, s53
\n
"
" v_perm_b32 v138, v170, v138, s54
\n
"
" v_perm_b32 v138, v171, v138, s55
\n
"
" v_mul_f32 v172, v23, v172
\n
"
" v_mul_f32 v173, v23, v173
\n
"
" v_mul_f32 v174, v23, v174
\n
"
" v_mul_f32 v175, v23, v175
\n
"
" v_cvt_i32_f32 v172, v172
\n
"
" v_cvt_i32_f32 v173, v173
\n
"
" v_cvt_i32_f32 v174, v174
\n
"
" v_cvt_i32_f32 v175, v175
\n
"
" v_perm_b32 v139, v173, v172, s53
\n
"
" v_perm_b32 v139, v174, v139, s54
\n
"
" v_perm_b32 v139, v175, v139, s55
\n
"
" v_mul_f32 v176, v22, v176
\n
"
" v_mul_f32 v177, v22, v177
\n
"
" v_mul_f32 v178, v22, v178
\n
"
" v_mul_f32 v179, v22, v179
\n
"
" v_cvt_i32_f32 v176, v176
\n
"
" v_cvt_i32_f32 v177, v177
\n
"
" v_cvt_i32_f32 v178, v178
\n
"
" v_cvt_i32_f32 v179, v179
\n
"
" v_perm_b32 v140, v177, v176, s53
\n
"
" v_perm_b32 v140, v178, v140, s54
\n
"
" v_perm_b32 v140, v179, v140, s55
\n
"
" v_mul_f32 v180, v23, v180
\n
"
" v_mul_f32 v181, v23, v181
\n
"
" v_mul_f32 v182, v23, v182
\n
"
" v_mul_f32 v183, v23, v183
\n
"
" v_cvt_i32_f32 v180, v180
\n
"
" v_cvt_i32_f32 v181, v181
\n
"
" v_cvt_i32_f32 v182, v182
\n
"
" v_cvt_i32_f32 v183, v183
\n
"
" v_perm_b32 v141, v181, v180, s53
\n
"
" v_perm_b32 v141, v182, v141, s54
\n
"
" v_perm_b32 v141, v183, v141, s55
\n
"
" v_mul_f32 v184, v22, v184
\n
"
" v_mul_f32 v185, v22, v185
\n
"
" v_mul_f32 v186, v22, v186
\n
"
" v_mul_f32 v187, v22, v187
\n
"
" v_cvt_i32_f32 v184, v184
\n
"
" v_cvt_i32_f32 v185, v185
\n
"
" v_cvt_i32_f32 v186, v186
\n
"
" v_cvt_i32_f32 v187, v187
\n
"
" v_perm_b32 v142, v185, v184, s53
\n
"
" v_perm_b32 v142, v186, v142, s54
\n
"
" v_perm_b32 v142, v187, v142, s55
\n
"
" v_mul_f32 v188, v23, v188
\n
"
" v_mul_f32 v189, v23, v189
\n
"
" v_mul_f32 v190, v23, v190
\n
"
" v_mul_f32 v191, v23, v191
\n
"
" v_cvt_i32_f32 v188, v188
\n
"
" v_cvt_i32_f32 v189, v189
\n
"
" v_cvt_i32_f32 v190, v190
\n
"
" v_cvt_i32_f32 v191, v191
\n
"
" v_perm_b32 v143, v189, v188, s53
\n
"
" v_perm_b32 v143, v190, v143, s54
\n
"
" v_perm_b32 v143, v191, v143, s55
\n
"
" v_rcp_f32 v24, v22
\n
"
" v_rcp_f32 v25, v23
\n
"
" v_lshrrev_b32 v54, 5, v0
\n
"
" v_lshlrev_b32 v55, 5, v54
\n
"
" v_and_b32 v54, 31, v0
\n
"
" v_lshrrev_b32 v56, 4, v54
\n
"
" v_add_u32 v55, v56, v55
\n
"
" v_and_b32 v54, 15, v0
\n
"
" v_lshlrev_b32 v54, 1, v54
\n
"
" v_add_u32 v55, v54, v55
\n
"
" v_lshlrev_b32 v54, 2, v55
\n
"
" s_mul_i32 s60, 0x00000100, s7
\n
"
" v_add_u32 v54, v54, s60
\n
"
" ds_write_b32 v54, v128 offset:18688
\n
"
" ds_write_b32 v54, v129 offset:26880
\n
"
" ds_write_b32 v54, v130 offset:19712
\n
"
" ds_write_b32 v54, v131 offset:27904
\n
"
" ds_write_b32 v54, v132 offset:20736
\n
"
" ds_write_b32 v54, v133 offset:28928
\n
"
" ds_write_b32 v54, v134 offset:21760
\n
"
" ds_write_b32 v54, v135 offset:29952
\n
"
" ds_write_b32 v54, v136 offset:22784
\n
"
" ds_write_b32 v54, v137 offset:30976
\n
"
" ds_write_b32 v54, v138 offset:23808
\n
"
" ds_write_b32 v54, v139 offset:32000
\n
"
" ds_write_b32 v54, v140 offset:24832
\n
"
" ds_write_b32 v54, v141 offset:33024
\n
"
" ds_write_b32 v54, v142 offset:25856
\n
"
" ds_write_b32 v54, v143 offset:34048
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_barrier
\n
"
" v_lshrrev_b32 v54, 4, v0
\n
"
" v_lshlrev_b32 v55, 6, v54
\n
"
" v_and_b32 v54, 15, v0
\n
"
" v_lshlrev_b32 v54, 1, v54
\n
"
" v_add_u32 v55, v54, v55
\n
"
" v_lshlrev_b32 v54, 2, v55
\n
"
#undef _UK_MFMA_
#undef _UK_PK_CVT_
#undef _UK_ATOMIC_ADD_
include/ck_tile/ops/flatmm/block/uk/flatmm_sn_uk_gfx9_32x256x512_1x4x1_16x16x32_int8_2.inc
0 → 100644
View file @
a759277d
#ifndef CK_TILE_FLATMM_UK_MFMA
#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_INT8
#endif
#if CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_INT8
# define _UK_MFMA_ "v_mfma_i32_16x16x32_i8"
# define _UK_PK_CVT_(x0_, x1_, y_) \
" v_cmp_u_f32 s[36:37], "
x0_
", "
x0_
"
\n
"
\
" v_add3_u32 v50, "
x0_
", %[v_nan_lo], 1
\n
"
\
" v_cndmask_b32 v54, v50, %[v_nan_hi], s[36:37]
\n
"
\
" v_cmp_u_f32 s[36:37], "
x1_
", "
x1_
"
\n
"
\
" v_add3_u32 v50, "
x1_
", %[v_nan_lo], 1
\n
"
\
" v_cndmask_b32 v55, v50, %[v_nan_hi], s[36:37]
\n
"
\
" v_perm_b32 "
y_
", v55, v54, s52
\n
"
# define _UK_ATOMIC_ADD_ "global_atomic_pk_add_bf16"
#elif CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_FP16
#define _UK_MFMA_ "v_mfma_f32_16x16x16_f16"
# define _UK_PK_CVT_(x0_, x1_, y_) \
" v_cvt_f16_f32 v54, "
x0_
"
\n
"
\
" v_cvt_f16_f32 v55, "
x1_
"
\n
"
\
" v_pack_b32_f16 "
y_
", v54, v55
\n
"
# define _UK_ATOMIC_ADD_ "global_atomic_pk_add_f16"
#endif
" ds_read_b64 v[128:129], v54 offset:18688
\n
"
" ds_read_b64 v[130:131], v54 offset:18816
\n
"
" ds_read_b64 v[132:133], v54 offset:19712
\n
"
" ds_read_b64 v[134:135], v54 offset:19840
\n
"
" ds_read_b64 v[136:137], v54 offset:20736
\n
"
" ds_read_b64 v[138:139], v54 offset:20864
\n
"
" ds_read_b64 v[140:141], v54 offset:21760
\n
"
" ds_read_b64 v[142:143], v54 offset:21888
\n
"
" ds_read_b64 v[144:145], v54 offset:22784
\n
"
" ds_read_b64 v[146:147], v54 offset:22912
\n
"
" ds_read_b64 v[148:149], v54 offset:23808
\n
"
" ds_read_b64 v[150:151], v54 offset:23936
\n
"
" ds_read_b64 v[152:153], v54 offset:24832
\n
"
" ds_read_b64 v[154:155], v54 offset:24960
\n
"
" ds_read_b64 v[156:157], v54 offset:25856
\n
"
" ds_read_b64 v[158:159], v54 offset:25984
\n
"
" ds_read_b64 v[160:161], v54 offset:26880
\n
"
" ds_read_b64 v[162:163], v54 offset:27008
\n
"
" ds_read_b64 v[164:165], v54 offset:27904
\n
"
" ds_read_b64 v[166:167], v54 offset:28032
\n
"
" ds_read_b64 v[168:169], v54 offset:28928
\n
"
" ds_read_b64 v[170:171], v54 offset:29056
\n
"
" ds_read_b64 v[172:173], v54 offset:29952
\n
"
" ds_read_b64 v[174:175], v54 offset:30080
\n
"
" ds_read_b64 v[176:177], v54 offset:30976
\n
"
" ds_read_b64 v[178:179], v54 offset:31104
\n
"
" ds_read_b64 v[180:181], v54 offset:32000
\n
"
" ds_read_b64 v[182:183], v54 offset:32128
\n
"
" ds_read_b64 v[184:185], v54 offset:33024
\n
"
" ds_read_b64 v[186:187], v54 offset:33152
\n
"
" ds_read_b64 v[188:189], v54 offset:34048
\n
"
" ds_read_b64 v[190:191], v54 offset:34176
\n
"
" s_add_u32 s12, %[s_tile_os_b], s12
\n
"
" s_addc_u32 s13, 0, s13
\n
"
" s_add_u32 s16, %[s_tile_os_dq], s16
\n
"
" s_addc_u32 s17, 0, s17
\n
"
" s_mov_b32 s80, 0
\n
"
" s_waitcnt 0x0000
\n
"
"label_0C3C:
\n
"
" s_waitcnt vmcnt(41)
\n
"
" s_barrier
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[0:1], v[128:129], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[2:3], v[130:131], v[192:195]
\n
"
" buffer_load_dwordx4 acc[128:131], %[v_os_b0], s[12:15], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[4:5], v[132:133], v[192:195]
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[6:7], v[134:135], v[192:195]
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[8:9], v[136:137], v[192:195]
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[10:11], v[138:139], v[192:195]
\n
"
" buffer_load_dwordx4 acc[132:135], %[v_os_b0], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[12:13], v[140:141], v[192:195]
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[14:15], v[142:143], v[192:195]
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[0:1], v[160:161], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[2:3], v[162:163], v[196:199]
\n
"
" buffer_load_dwordx4 acc[136:139], %[v_os_b0], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[4:5], v[164:165], v[196:199]
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[6:7], v[166:167], v[196:199]
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[8:9], v[168:169], v[196:199]
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[10:11], v[170:171], v[196:199]
\n
"
" buffer_load_dwordx4 acc[140:143], %[v_os_b0], s[12:15], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[12:13], v[172:173], v[196:199]
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[14:15], v[174:175], v[196:199]
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[16:17], v[128:129], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[18:19], v[130:131], v[200:203]
\n
"
" buffer_load_dwordx4 acc[144:147], %[v_os_b1], s[12:15], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[20:21], v[132:133], v[200:203]
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[22:23], v[134:135], v[200:203]
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[24:25], v[136:137], v[200:203]
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[26:27], v[138:139], v[200:203]
\n
"
" buffer_load_dwordx4 acc[148:151], %[v_os_b1], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[28:29], v[140:141], v[200:203]
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[30:31], v[142:143], v[200:203]
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[16:17], v[160:161], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[18:19], v[162:163], v[204:207]
\n
"
" buffer_load_dwordx4 acc[152:155], %[v_os_b1], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[20:21], v[164:165], v[204:207]
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[22:23], v[166:167], v[204:207]
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[24:25], v[168:169], v[204:207]
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[26:27], v[170:171], v[204:207]
\n
"
" buffer_load_dwordx4 acc[156:159], %[v_os_b1], s[12:15], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[28:29], v[172:173], v[204:207]
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[30:31], v[174:175], v[204:207]
\n
"
" s_waitcnt vmcnt(41)
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[32:33], v[128:129], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[34:35], v[130:131], v[208:211]
\n
"
" buffer_load_dwordx4 acc[160:163], %[v_os_b2], s[12:15], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[36:37], v[132:133], v[208:211]
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[38:39], v[134:135], v[208:211]
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[40:41], v[136:137], v[208:211]
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[42:43], v[138:139], v[208:211]
\n
"
" buffer_load_dwordx4 acc[164:167], %[v_os_b2], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[44:45], v[140:141], v[208:211]
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[46:47], v[142:143], v[208:211]
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[32:33], v[160:161], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[34:35], v[162:163], v[212:215]
\n
"
" buffer_load_dwordx4 acc[168:171], %[v_os_b2], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[36:37], v[164:165], v[212:215]
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[38:39], v[166:167], v[212:215]
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[40:41], v[168:169], v[212:215]
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[42:43], v[170:171], v[212:215]
\n
"
" buffer_load_dwordx4 acc[172:175], %[v_os_b2], s[12:15], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[44:45], v[172:173], v[212:215]
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[46:47], v[174:175], v[212:215]
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[48:49], v[128:129], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[50:51], v[130:131], v[216:219]
\n
"
" buffer_load_dwordx4 acc[176:179], %[v_os_b3], s[12:15], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[52:53], v[132:133], v[216:219]
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[54:55], v[134:135], v[216:219]
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[56:57], v[136:137], v[216:219]
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[58:59], v[138:139], v[216:219]
\n
"
" buffer_load_dwordx4 acc[180:183], %[v_os_b3], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[60:61], v[140:141], v[216:219]
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[62:63], v[142:143], v[216:219]
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[48:49], v[160:161], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[50:51], v[162:163], v[220:223]
\n
"
" buffer_load_dwordx4 acc[184:187], %[v_os_b3], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[52:53], v[164:165], v[220:223]
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[54:55], v[166:167], v[220:223]
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[56:57], v[168:169], v[220:223]
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[58:59], v[170:171], v[220:223]
\n
"
" buffer_load_dwordx4 acc[188:191], %[v_os_b3], s[12:15], 0 offen offset:3072
\n
"
" s_add_u32 s12, %[s_tile_os_b_half], s12
\n
"
" s_addc_u32 s13, 0, s13
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[60:61], v[172:173], v[220:223]
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[62:63], v[174:175], v[220:223]
\n
"
" s_waitcnt vmcnt(41)
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[64:65], v[144:145], v[192:195]
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[66:67], v[146:147], v[192:195]
\n
"
" buffer_load_dwordx4 acc[192:195], %[v_os_b0], s[12:15], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[68:69], v[148:149], v[192:195]
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[70:71], v[150:151], v[192:195]
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[72:73], v[152:153], v[192:195]
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[74:75], v[154:155], v[192:195]
\n
"
" buffer_load_dwordx4 acc[196:199], %[v_os_b0], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[76:77], v[156:157], v[192:195]
\n
"
" v_mfma_i32_16x16x32_i8 v[192:195], acc[78:79], v[158:159], v[192:195]
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[64:65], v[176:177], v[196:199]
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[66:67], v[178:179], v[196:199]
\n
"
" buffer_load_dwordx4 acc[200:203], %[v_os_b0], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[68:69], v[180:181], v[196:199]
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[70:71], v[182:183], v[196:199]
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[72:73], v[184:185], v[196:199]
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[74:75], v[186:187], v[196:199]
\n
"
" buffer_load_dwordx4 acc[204:207], %[v_os_b0], s[12:15], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[76:77], v[188:189], v[196:199]
\n
"
" v_mfma_i32_16x16x32_i8 v[196:199], acc[78:79], v[190:191], v[196:199]
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[80:81], v[144:145], v[200:203]
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[82:83], v[146:147], v[200:203]
\n
"
" buffer_load_dwordx4 acc[208:211], %[v_os_b1], s[12:15], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[84:85], v[148:149], v[200:203]
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[86:87], v[150:151], v[200:203]
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[88:89], v[152:153], v[200:203]
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[90:91], v[154:155], v[200:203]
\n
"
" buffer_load_dwordx4 acc[212:215], %[v_os_b1], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[92:93], v[156:157], v[200:203]
\n
"
" v_mfma_i32_16x16x32_i8 v[200:203], acc[94:95], v[158:159], v[200:203]
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[80:81], v[176:177], v[204:207]
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[82:83], v[178:179], v[204:207]
\n
"
" buffer_load_dwordx4 acc[216:219], %[v_os_b1], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[84:85], v[180:181], v[204:207]
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[86:87], v[182:183], v[204:207]
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[88:89], v[184:185], v[204:207]
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[90:91], v[186:187], v[204:207]
\n
"
" buffer_load_dwordx4 acc[220:223], %[v_os_b1], s[12:15], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[92:93], v[188:189], v[204:207]
\n
"
" v_mfma_i32_16x16x32_i8 v[204:207], acc[94:95], v[190:191], v[204:207]
\n
"
" s_waitcnt vmcnt(40)
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[96:97], v[144:145], v[208:211]
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[98:99], v[146:147], v[208:211]
\n
"
" buffer_load_dwordx4 acc[224:227], %[v_os_b2], s[12:15], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[100:101], v[148:149], v[208:211]
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[102:103], v[150:151], v[208:211]
\n
"
" buffer_load_dword v13, v5, s[16:19], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[104:105], v[152:153], v[208:211]
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[106:107], v[154:155], v[208:211]
\n
"
" buffer_load_dwordx4 acc[228:231], %[v_os_b2], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[108:109], v[156:157], v[208:211]
\n
"
" v_mfma_i32_16x16x32_i8 v[208:211], acc[110:111], v[158:159], v[208:211]
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[96:97], v[176:177], v[212:215]
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[98:99], v[178:179], v[212:215]
\n
"
" buffer_load_dwordx4 acc[232:235], %[v_os_b2], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[100:101], v[180:181], v[212:215]
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[102:103], v[182:183], v[212:215]
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[104:105], v[184:185], v[212:215]
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[106:107], v[186:187], v[212:215]
\n
"
" buffer_load_dwordx4 acc[236:239], %[v_os_b2], s[12:15], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[108:109], v[188:189], v[212:215]
\n
"
" v_mfma_i32_16x16x32_i8 v[212:215], acc[110:111], v[190:191], v[212:215]
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[112:113], v[144:145], v[216:219]
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[114:115], v[146:147], v[216:219]
\n
"
" buffer_load_dwordx4 acc[240:243], %[v_os_b3], s[12:15], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[116:117], v[148:149], v[216:219]
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[118:119], v[150:151], v[216:219]
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[120:121], v[152:153], v[216:219]
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[122:123], v[154:155], v[216:219]
\n
"
" buffer_load_dwordx4 acc[244:247], %[v_os_b3], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[124:125], v[156:157], v[216:219]
\n
"
" v_mfma_i32_16x16x32_i8 v[216:219], acc[126:127], v[158:159], v[216:219]
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[112:113], v[176:177], v[220:223]
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[114:115], v[178:179], v[220:223]
\n
"
" buffer_load_dwordx4 acc[248:251], %[v_os_b3], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[116:117], v[180:181], v[220:223]
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[118:119], v[182:183], v[220:223]
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[120:121], v[184:185], v[220:223]
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[122:123], v[186:187], v[220:223]
\n
"
" buffer_load_dwordx4 acc[252:255], %[v_os_b3], s[12:15], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[124:125], v[188:189], v[220:223]
\n
"
" v_mfma_i32_16x16x32_i8 v[220:223], acc[126:127], v[190:191], v[220:223]
\n
"
" s_add_u32 s60, 0x00000200, s80
\n
"
" s_cmp_lt_u32 s60, s81
\n
"
" s_cselect_b32 %[s_tile_os_b], %[s_tile_os_b], 0
\n
"
" s_cselect_b32 %[s_tile_os_b_half], %[s_tile_os_b_half], 0
\n
"
" s_cselect_b32 %[s_tile_os_dq], %[s_tile_os_dq], 0
\n
"
" s_add_u32 s12, %[s_tile_os_b], s12
\n
"
" s_addc_u32 s13, 0, s13
\n
"
" s_add_u32 s16, %[s_tile_os_dq], s16
\n
"
" s_addc_u32 s17, 0, s17
\n
"
" v_cvt_f32_i32 v192, v192
\n
"
" v_cvt_f32_i32 v193, v193
\n
"
" v_cvt_f32_i32 v194, v194
\n
"
" v_cvt_f32_i32 v195, v195
\n
"
" v_mul_f32 v192, v24, v192
\n
"
" v_mul_f32 v193, v24, v193
\n
"
" v_mul_f32 v194, v24, v194
\n
"
" v_mul_f32 v195, v24, v195
\n
"
" v_mul_f32 v192, v12, v192 row_newbcast:0
\n
"
" v_mul_f32 v193, v12, v193 row_newbcast:1
\n
"
" v_mul_f32 v194, v12, v194 row_newbcast:2
\n
"
" v_mul_f32 v195, v12, v195 row_newbcast:3
\n
"
" v_mul_f32 v192, v20, v192
\n
"
" v_mul_f32 v193, v20, v193
\n
"
" v_mul_f32 v194, v20, v194
\n
"
" v_mul_f32 v195, v20, v195
\n
"
" v_cvt_f32_i32 v196, v196
\n
"
" v_cvt_f32_i32 v197, v197
\n
"
" v_cvt_f32_i32 v198, v198
\n
"
" v_cvt_f32_i32 v199, v199
\n
"
" v_mul_f32 v196, v25, v196
\n
"
" v_mul_f32 v197, v25, v197
\n
"
" v_mul_f32 v198, v25, v198
\n
"
" v_mul_f32 v199, v25, v199
\n
"
" v_mul_f32 v196, v12, v196 row_newbcast:0
\n
"
" v_mul_f32 v197, v12, v197 row_newbcast:1
\n
"
" v_mul_f32 v198, v12, v198 row_newbcast:2
\n
"
" v_mul_f32 v199, v12, v199 row_newbcast:3
\n
"
" v_mul_f32 v196, v21, v196
\n
"
" v_mul_f32 v197, v21, v197
\n
"
" v_mul_f32 v198, v21, v198
\n
"
" v_mul_f32 v199, v21, v199
\n
"
" v_cvt_f32_i32 v200, v200
\n
"
" v_cvt_f32_i32 v201, v201
\n
"
" v_cvt_f32_i32 v202, v202
\n
"
" v_cvt_f32_i32 v203, v203
\n
"
" v_mul_f32 v200, v24, v200
\n
"
" v_mul_f32 v201, v24, v201
\n
"
" v_mul_f32 v202, v24, v202
\n
"
" v_mul_f32 v203, v24, v203
\n
"
" v_mul_f32 v200, v12, v200 row_newbcast:4
\n
"
" v_mul_f32 v201, v12, v201 row_newbcast:5
\n
"
" v_mul_f32 v202, v12, v202 row_newbcast:6
\n
"
" v_mul_f32 v203, v12, v203 row_newbcast:7
\n
"
" v_mul_f32 v200, v20, v200
\n
"
" v_mul_f32 v201, v20, v201
\n
"
" v_mul_f32 v202, v20, v202
\n
"
" v_mul_f32 v203, v20, v203
\n
"
" v_cvt_f32_i32 v204, v204
\n
"
" v_cvt_f32_i32 v205, v205
\n
"
" v_cvt_f32_i32 v206, v206
\n
"
" v_cvt_f32_i32 v207, v207
\n
"
" v_mul_f32 v204, v25, v204
\n
"
" v_mul_f32 v205, v25, v205
\n
"
" v_mul_f32 v206, v25, v206
\n
"
" v_mul_f32 v207, v25, v207
\n
"
" v_mul_f32 v204, v12, v204 row_newbcast:4
\n
"
" v_mul_f32 v205, v12, v205 row_newbcast:5
\n
"
" v_mul_f32 v206, v12, v206 row_newbcast:6
\n
"
" v_mul_f32 v207, v12, v207 row_newbcast:7
\n
"
" v_mul_f32 v204, v21, v204
\n
"
" v_mul_f32 v205, v21, v205
\n
"
" v_mul_f32 v206, v21, v206
\n
"
" v_mul_f32 v207, v21, v207
\n
"
" v_cvt_f32_i32 v208, v208
\n
"
" v_cvt_f32_i32 v209, v209
\n
"
" v_cvt_f32_i32 v210, v210
\n
"
" v_cvt_f32_i32 v211, v211
\n
"
" v_mul_f32 v208, v24, v208
\n
"
" v_mul_f32 v209, v24, v209
\n
"
" v_mul_f32 v210, v24, v210
\n
"
" v_mul_f32 v211, v24, v211
\n
"
" v_mul_f32 v208, v12, v208 row_newbcast:8
\n
"
" v_mul_f32 v209, v12, v209 row_newbcast:9
\n
"
" v_mul_f32 v210, v12, v210 row_newbcast:10
\n
"
" v_mul_f32 v211, v12, v211 row_newbcast:11
\n
"
" v_mul_f32 v208, v20, v208
\n
"
" v_mul_f32 v209, v20, v209
\n
"
" v_mul_f32 v210, v20, v210
\n
"
" v_mul_f32 v211, v20, v211
\n
"
" v_cvt_f32_i32 v212, v212
\n
"
" v_cvt_f32_i32 v213, v213
\n
"
" v_cvt_f32_i32 v214, v214
\n
"
" v_cvt_f32_i32 v215, v215
\n
"
" v_mul_f32 v212, v25, v212
\n
"
" v_mul_f32 v213, v25, v213
\n
"
" v_mul_f32 v214, v25, v214
\n
"
" v_mul_f32 v215, v25, v215
\n
"
" v_mul_f32 v212, v12, v212 row_newbcast:8
\n
"
" v_mul_f32 v213, v12, v213 row_newbcast:9
\n
"
" v_mul_f32 v214, v12, v214 row_newbcast:10
\n
"
" v_mul_f32 v215, v12, v215 row_newbcast:11
\n
"
" v_mul_f32 v212, v21, v212
\n
"
" v_mul_f32 v213, v21, v213
\n
"
" v_mul_f32 v214, v21, v214
\n
"
" v_mul_f32 v215, v21, v215
\n
"
" v_cvt_f32_i32 v216, v216
\n
"
" v_cvt_f32_i32 v217, v217
\n
"
" v_cvt_f32_i32 v218, v218
\n
"
" v_cvt_f32_i32 v219, v219
\n
"
" v_mul_f32 v216, v24, v216
\n
"
" v_mul_f32 v217, v24, v217
\n
"
" v_mul_f32 v218, v24, v218
\n
"
" v_mul_f32 v219, v24, v219
\n
"
" v_mul_f32 v216, v12, v216 row_newbcast:12
\n
"
" v_mul_f32 v217, v12, v217 row_newbcast:13
\n
"
" v_mul_f32 v218, v12, v218 row_newbcast:14
\n
"
" v_mul_f32 v219, v12, v219 row_newbcast:15
\n
"
" v_mul_f32 v216, v20, v216
\n
"
" v_mul_f32 v217, v20, v217
\n
"
" v_mul_f32 v218, v20, v218
\n
"
" v_mul_f32 v219, v20, v219
\n
"
" v_cvt_f32_i32 v220, v220
\n
"
" v_cvt_f32_i32 v221, v221
\n
"
" v_cvt_f32_i32 v222, v222
\n
"
" v_cvt_f32_i32 v223, v223
\n
"
" v_mul_f32 v220, v25, v220
\n
"
" v_mul_f32 v221, v25, v221
\n
"
" v_mul_f32 v222, v25, v222
\n
"
" v_mul_f32 v223, v25, v223
\n
"
" v_mul_f32 v220, v12, v220 row_newbcast:12
\n
"
" v_mul_f32 v221, v12, v221 row_newbcast:13
\n
"
" v_mul_f32 v222, v12, v222 row_newbcast:14
\n
"
" v_mul_f32 v223, v12, v223 row_newbcast:15
\n
"
" v_mul_f32 v220, v21, v220
\n
"
" v_mul_f32 v221, v21, v221
\n
"
" v_mul_f32 v222, v21, v222
\n
"
" v_mul_f32 v223, v21, v223
\n
"
" v_cmp_u_f32 s[48:49], v192, v192
\n
"
" v_add3_u32 v50, v192, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v193, v193
\n
"
" v_add3_u32 v50, v193, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v192, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v194, v194
\n
"
" v_add3_u32 v50, v194, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v195, v195
\n
"
" v_add3_u32 v50, v195, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v193, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v196, v196
\n
"
" v_add3_u32 v50, v196, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v197, v197
\n
"
" v_add3_u32 v50, v197, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v194, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v198, v198
\n
"
" v_add3_u32 v50, v198, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v199, v199
\n
"
" v_add3_u32 v50, v199, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v195, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v200, v200
\n
"
" v_add3_u32 v50, v200, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v201, v201
\n
"
" v_add3_u32 v50, v201, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v196, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v202, v202
\n
"
" v_add3_u32 v50, v202, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v203, v203
\n
"
" v_add3_u32 v50, v203, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v197, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v204, v204
\n
"
" v_add3_u32 v50, v204, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v205, v205
\n
"
" v_add3_u32 v50, v205, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v198, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v206, v206
\n
"
" v_add3_u32 v50, v206, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v207, v207
\n
"
" v_add3_u32 v50, v207, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v199, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v208, v208
\n
"
" v_add3_u32 v50, v208, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v209, v209
\n
"
" v_add3_u32 v50, v209, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v200, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v210, v210
\n
"
" v_add3_u32 v50, v210, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v211, v211
\n
"
" v_add3_u32 v50, v211, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v201, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v212, v212
\n
"
" v_add3_u32 v50, v212, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v213, v213
\n
"
" v_add3_u32 v50, v213, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v202, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v214, v214
\n
"
" v_add3_u32 v50, v214, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v215, v215
\n
"
" v_add3_u32 v50, v215, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v203, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v216, v216
\n
"
" v_add3_u32 v50, v216, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v217, v217
\n
"
" v_add3_u32 v50, v217, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v204, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v218, v218
\n
"
" v_add3_u32 v50, v218, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v219, v219
\n
"
" v_add3_u32 v50, v219, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v205, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v220, v220
\n
"
" v_add3_u32 v50, v220, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v221, v221
\n
"
" v_add3_u32 v50, v221, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v206, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v222, v222
\n
"
" v_add3_u32 v50, v222, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v223, v223
\n
"
" v_add3_u32 v50, v223, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v207, v55, v54, s52
\n
"
" ds_write_b64 v3, v[192:193] offset:35072
\n
"
" ds_write_b64 v3, v[194:195] offset:43776
\n
"
" ds_write_b64 v3, v[196:197] offset:37248
\n
"
" ds_write_b64 v3, v[198:199] offset:45952
\n
"
" ds_write_b64 v3, v[200:201] offset:39424
\n
"
" ds_write_b64 v3, v[202:203] offset:48128
\n
"
" ds_write_b64 v3, v[204:205] offset:41600
\n
"
" ds_write_b64 v3, v[206:207] offset:50304
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_barrier
\n
"
" ds_read_b32 v64, v4 offset:35072
\n
"
" ds_read_b32 v65, v4 offset:39424
\n
"
" ds_read_b32 v66, v4 offset:35104
\n
"
" ds_read_b32 v67, v4 offset:39456
\n
"
" ds_read_b32 v68, v4 offset:35136
\n
"
" ds_read_b32 v69, v4 offset:39488
\n
"
" ds_read_b32 v70, v4 offset:35168
\n
"
" ds_read_b32 v71, v4 offset:39520
\n
"
" ds_read_b32 v72, v4 offset:43776
\n
"
" ds_read_b32 v73, v4 offset:48128
\n
"
" ds_read_b32 v74, v4 offset:43808
\n
"
" ds_read_b32 v75, v4 offset:48160
\n
"
" ds_read_b32 v76, v4 offset:43840
\n
"
" ds_read_b32 v77, v4 offset:48192
\n
"
" ds_read_b32 v78, v4 offset:43872
\n
"
" ds_read_b32 v79, v4 offset:48224
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_mov_b64 exec, s[20:21]
\n
"
" global_atomic_pk_add_bf16 v80, v64, s[8:9]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[20:21]
\n
"
" global_atomic_pk_add_bf16 v80, v65, s[8:9] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[22:23]
\n
"
" global_atomic_pk_add_bf16 v82, v66, s[8:9]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[22:23]
\n
"
" global_atomic_pk_add_bf16 v82, v67, s[8:9] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[24:25]
\n
"
" global_atomic_pk_add_bf16 v84, v68, s[8:9]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[24:25]
\n
"
" global_atomic_pk_add_bf16 v84, v69, s[8:9] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[26:27]
\n
"
" global_atomic_pk_add_bf16 v86, v70, s[8:9]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[26:27]
\n
"
" global_atomic_pk_add_bf16 v86, v71, s[8:9] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[28:29]
\n
"
" global_atomic_pk_add_bf16 v88, v72, s[8:9]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[28:29]
\n
"
" global_atomic_pk_add_bf16 v88, v73, s[8:9] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[30:31]
\n
"
" global_atomic_pk_add_bf16 v90, v74, s[8:9]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[30:31]
\n
"
" global_atomic_pk_add_bf16 v90, v75, s[8:9] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[32:33]
\n
"
" global_atomic_pk_add_bf16 v92, v76, s[8:9]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[32:33]
\n
"
" global_atomic_pk_add_bf16 v92, v77, s[8:9] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[34:35]
\n
"
" global_atomic_pk_add_bf16 v94, v78, s[8:9]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[34:35]
\n
"
" global_atomic_pk_add_bf16 v94, v79, s[8:9] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_add_u32 s8, s59, s8
\n
"
" s_addc_u32 s9, 0, s9
\n
"
" s_addk_i32 s80, 0x0100
\n
"
" s_cmp_lt_i32 s80, s81
\n
"
" s_cbranch_scc0 label_2301
\n
"
" s_waitcnt vmcnt(41)
\n
"
" s_barrier
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[128:129], v[128:129], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[130:131], v[130:131], v[224:227]
\n
"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[132:133], v[132:133], v[224:227]
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[134:135], v[134:135], v[224:227]
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[136:137], v[136:137], v[224:227]
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[138:139], v[138:139], v[224:227]
\n
"
" buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[140:141], v[140:141], v[224:227]
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[142:143], v[142:143], v[224:227]
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[128:129], v[160:161], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[130:131], v[162:163], v[228:231]
\n
"
" buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[132:133], v[164:165], v[228:231]
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[134:135], v[166:167], v[228:231]
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[136:137], v[168:169], v[228:231]
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[138:139], v[170:171], v[228:231]
\n
"
" buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[140:141], v[172:173], v[228:231]
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[142:143], v[174:175], v[228:231]
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[144:145], v[128:129], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[146:147], v[130:131], v[232:235]
\n
"
" buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[148:149], v[132:133], v[232:235]
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[150:151], v[134:135], v[232:235]
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[152:153], v[136:137], v[232:235]
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[154:155], v[138:139], v[232:235]
\n
"
" buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[156:157], v[140:141], v[232:235]
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[158:159], v[142:143], v[232:235]
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[144:145], v[160:161], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[146:147], v[162:163], v[236:239]
\n
"
" buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[148:149], v[164:165], v[236:239]
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[150:151], v[166:167], v[236:239]
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[152:153], v[168:169], v[236:239]
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[154:155], v[170:171], v[236:239]
\n
"
" buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[156:157], v[172:173], v[236:239]
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[158:159], v[174:175], v[236:239]
\n
"
" s_waitcnt vmcnt(41)
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[160:161], v[128:129], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[162:163], v[130:131], v[240:243]
\n
"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[164:165], v[132:133], v[240:243]
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[166:167], v[134:135], v[240:243]
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[168:169], v[136:137], v[240:243]
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[170:171], v[138:139], v[240:243]
\n
"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[172:173], v[140:141], v[240:243]
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[174:175], v[142:143], v[240:243]
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[160:161], v[160:161], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[162:163], v[162:163], v[244:247]
\n
"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[164:165], v[164:165], v[244:247]
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[166:167], v[166:167], v[244:247]
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[168:169], v[168:169], v[244:247]
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[170:171], v[170:171], v[244:247]
\n
"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[172:173], v[172:173], v[244:247]
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[174:175], v[174:175], v[244:247]
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[176:177], v[128:129], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[178:179], v[130:131], v[248:251]
\n
"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[180:181], v[132:133], v[248:251]
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[182:183], v[134:135], v[248:251]
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[184:185], v[136:137], v[248:251]
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[186:187], v[138:139], v[248:251]
\n
"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[188:189], v[140:141], v[248:251]
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[190:191], v[142:143], v[248:251]
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[176:177], v[160:161], 0
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[178:179], v[162:163], v[252:255]
\n
"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[180:181], v[164:165], v[252:255]
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[182:183], v[166:167], v[252:255]
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[184:185], v[168:169], v[252:255]
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[186:187], v[170:171], v[252:255]
\n
"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072
\n
"
" s_add_u32 s12, %[s_tile_os_b_half], s12
\n
"
" s_addc_u32 s13, 0, s13
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[188:189], v[172:173], v[252:255]
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[190:191], v[174:175], v[252:255]
\n
"
" s_waitcnt vmcnt(41)
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[192:193], v[144:145], v[224:227]
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[194:195], v[146:147], v[224:227]
\n
"
" buffer_load_dwordx4 acc[64:67], %[v_os_b0], s[12:15], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[196:197], v[148:149], v[224:227]
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[198:199], v[150:151], v[224:227]
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[200:201], v[152:153], v[224:227]
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[202:203], v[154:155], v[224:227]
\n
"
" buffer_load_dwordx4 acc[68:71], %[v_os_b0], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[204:205], v[156:157], v[224:227]
\n
"
" v_mfma_i32_16x16x32_i8 v[224:227], acc[206:207], v[158:159], v[224:227]
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[192:193], v[176:177], v[228:231]
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[194:195], v[178:179], v[228:231]
\n
"
" buffer_load_dwordx4 acc[72:75], %[v_os_b0], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[196:197], v[180:181], v[228:231]
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[198:199], v[182:183], v[228:231]
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[200:201], v[184:185], v[228:231]
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[202:203], v[186:187], v[228:231]
\n
"
" buffer_load_dwordx4 acc[76:79], %[v_os_b0], s[12:15], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[204:205], v[188:189], v[228:231]
\n
"
" v_mfma_i32_16x16x32_i8 v[228:231], acc[206:207], v[190:191], v[228:231]
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[208:209], v[144:145], v[232:235]
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[210:211], v[146:147], v[232:235]
\n
"
" buffer_load_dwordx4 acc[80:83], %[v_os_b1], s[12:15], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[212:213], v[148:149], v[232:235]
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[214:215], v[150:151], v[232:235]
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[216:217], v[152:153], v[232:235]
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[218:219], v[154:155], v[232:235]
\n
"
" buffer_load_dwordx4 acc[84:87], %[v_os_b1], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[220:221], v[156:157], v[232:235]
\n
"
" v_mfma_i32_16x16x32_i8 v[232:235], acc[222:223], v[158:159], v[232:235]
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[208:209], v[176:177], v[236:239]
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[210:211], v[178:179], v[236:239]
\n
"
" buffer_load_dwordx4 acc[88:91], %[v_os_b1], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[212:213], v[180:181], v[236:239]
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[214:215], v[182:183], v[236:239]
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[216:217], v[184:185], v[236:239]
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[218:219], v[186:187], v[236:239]
\n
"
" buffer_load_dwordx4 acc[92:95], %[v_os_b1], s[12:15], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[220:221], v[188:189], v[236:239]
\n
"
" v_mfma_i32_16x16x32_i8 v[236:239], acc[222:223], v[190:191], v[236:239]
\n
"
" s_waitcnt vmcnt(40)
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[224:225], v[144:145], v[240:243]
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[226:227], v[146:147], v[240:243]
\n
"
" buffer_load_dwordx4 acc[96:99], %[v_os_b2], s[12:15], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[228:229], v[148:149], v[240:243]
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[230:231], v[150:151], v[240:243]
\n
"
" buffer_load_dword v12, v5, s[16:19], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[232:233], v[152:153], v[240:243]
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[234:235], v[154:155], v[240:243]
\n
"
" buffer_load_dwordx4 acc[100:103], %[v_os_b2], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[236:237], v[156:157], v[240:243]
\n
"
" v_mfma_i32_16x16x32_i8 v[240:243], acc[238:239], v[158:159], v[240:243]
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[224:225], v[176:177], v[244:247]
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[226:227], v[178:179], v[244:247]
\n
"
" buffer_load_dwordx4 acc[104:107], %[v_os_b2], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[228:229], v[180:181], v[244:247]
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[230:231], v[182:183], v[244:247]
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[232:233], v[184:185], v[244:247]
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[234:235], v[186:187], v[244:247]
\n
"
" buffer_load_dwordx4 acc[108:111], %[v_os_b2], s[12:15], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[236:237], v[188:189], v[244:247]
\n
"
" v_mfma_i32_16x16x32_i8 v[244:247], acc[238:239], v[190:191], v[244:247]
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[240:241], v[144:145], v[248:251]
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[242:243], v[146:147], v[248:251]
\n
"
" buffer_load_dwordx4 acc[112:115], %[v_os_b3], s[12:15], 0 offen
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[244:245], v[148:149], v[248:251]
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[246:247], v[150:151], v[248:251]
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[248:249], v[152:153], v[248:251]
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[250:251], v[154:155], v[248:251]
\n
"
" buffer_load_dwordx4 acc[116:119], %[v_os_b3], s[12:15], 0 offen offset:1024
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[252:253], v[156:157], v[248:251]
\n
"
" v_mfma_i32_16x16x32_i8 v[248:251], acc[254:255], v[158:159], v[248:251]
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[240:241], v[176:177], v[252:255]
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[242:243], v[178:179], v[252:255]
\n
"
" buffer_load_dwordx4 acc[120:123], %[v_os_b3], s[12:15], 0 offen offset:2048
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[244:245], v[180:181], v[252:255]
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[246:247], v[182:183], v[252:255]
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[248:249], v[184:185], v[252:255]
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[250:251], v[186:187], v[252:255]
\n
"
" buffer_load_dwordx4 acc[124:127], %[v_os_b3], s[12:15], 0 offen offset:3072
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[252:253], v[188:189], v[252:255]
\n
"
" v_mfma_i32_16x16x32_i8 v[252:255], acc[254:255], v[190:191], v[252:255]
\n
"
" s_add_u32 s60, 0x00000200, s80
\n
"
" s_cmp_lt_u32 s60, s81
\n
"
" s_cselect_b32 %[s_tile_os_b], %[s_tile_os_b], 0
\n
"
" s_cselect_b32 %[s_tile_os_b_half], %[s_tile_os_b_half], 0
\n
"
" s_cselect_b32 %[s_tile_os_dq], %[s_tile_os_dq], 0
\n
"
" s_add_u32 s12, %[s_tile_os_b], s12
\n
"
" s_addc_u32 s13, 0, s13
\n
"
" s_add_u32 s16, %[s_tile_os_dq], s16
\n
"
" s_addc_u32 s17, 0, s17
\n
"
" v_cvt_f32_i32 v224, v224
\n
"
" v_cvt_f32_i32 v225, v225
\n
"
" v_cvt_f32_i32 v226, v226
\n
"
" v_cvt_f32_i32 v227, v227
\n
"
" v_mul_f32 v224, v24, v224
\n
"
" v_mul_f32 v225, v24, v225
\n
"
" v_mul_f32 v226, v24, v226
\n
"
" v_mul_f32 v227, v24, v227
\n
"
" v_mul_f32 v224, v13, v224 row_newbcast:0
\n
"
" v_mul_f32 v225, v13, v225 row_newbcast:1
\n
"
" v_mul_f32 v226, v13, v226 row_newbcast:2
\n
"
" v_mul_f32 v227, v13, v227 row_newbcast:3
\n
"
" v_mul_f32 v224, v20, v224
\n
"
" v_mul_f32 v225, v20, v225
\n
"
" v_mul_f32 v226, v20, v226
\n
"
" v_mul_f32 v227, v20, v227
\n
"
" v_cvt_f32_i32 v228, v228
\n
"
" v_cvt_f32_i32 v229, v229
\n
"
" v_cvt_f32_i32 v230, v230
\n
"
" v_cvt_f32_i32 v231, v231
\n
"
" v_mul_f32 v228, v25, v228
\n
"
" v_mul_f32 v229, v25, v229
\n
"
" v_mul_f32 v230, v25, v230
\n
"
" v_mul_f32 v231, v25, v231
\n
"
" v_mul_f32 v228, v13, v228 row_newbcast:0
\n
"
" v_mul_f32 v229, v13, v229 row_newbcast:1
\n
"
" v_mul_f32 v230, v13, v230 row_newbcast:2
\n
"
" v_mul_f32 v231, v13, v231 row_newbcast:3
\n
"
" v_mul_f32 v228, v21, v228
\n
"
" v_mul_f32 v229, v21, v229
\n
"
" v_mul_f32 v230, v21, v230
\n
"
" v_mul_f32 v231, v21, v231
\n
"
" v_cvt_f32_i32 v232, v232
\n
"
" v_cvt_f32_i32 v233, v233
\n
"
" v_cvt_f32_i32 v234, v234
\n
"
" v_cvt_f32_i32 v235, v235
\n
"
" v_mul_f32 v232, v24, v232
\n
"
" v_mul_f32 v233, v24, v233
\n
"
" v_mul_f32 v234, v24, v234
\n
"
" v_mul_f32 v235, v24, v235
\n
"
" v_mul_f32 v232, v13, v232 row_newbcast:4
\n
"
" v_mul_f32 v233, v13, v233 row_newbcast:5
\n
"
" v_mul_f32 v234, v13, v234 row_newbcast:6
\n
"
" v_mul_f32 v235, v13, v235 row_newbcast:7
\n
"
" v_mul_f32 v232, v20, v232
\n
"
" v_mul_f32 v233, v20, v233
\n
"
" v_mul_f32 v234, v20, v234
\n
"
" v_mul_f32 v235, v20, v235
\n
"
" v_cvt_f32_i32 v236, v236
\n
"
" v_cvt_f32_i32 v237, v237
\n
"
" v_cvt_f32_i32 v238, v238
\n
"
" v_cvt_f32_i32 v239, v239
\n
"
" v_mul_f32 v236, v25, v236
\n
"
" v_mul_f32 v237, v25, v237
\n
"
" v_mul_f32 v238, v25, v238
\n
"
" v_mul_f32 v239, v25, v239
\n
"
" v_mul_f32 v236, v13, v236 row_newbcast:4
\n
"
" v_mul_f32 v237, v13, v237 row_newbcast:5
\n
"
" v_mul_f32 v238, v13, v238 row_newbcast:6
\n
"
" v_mul_f32 v239, v13, v239 row_newbcast:7
\n
"
" v_mul_f32 v236, v21, v236
\n
"
" v_mul_f32 v237, v21, v237
\n
"
" v_mul_f32 v238, v21, v238
\n
"
" v_mul_f32 v239, v21, v239
\n
"
" v_cvt_f32_i32 v240, v240
\n
"
" v_cvt_f32_i32 v241, v241
\n
"
" v_cvt_f32_i32 v242, v242
\n
"
" v_cvt_f32_i32 v243, v243
\n
"
" v_mul_f32 v240, v24, v240
\n
"
" v_mul_f32 v241, v24, v241
\n
"
" v_mul_f32 v242, v24, v242
\n
"
" v_mul_f32 v243, v24, v243
\n
"
" v_mul_f32 v240, v13, v240 row_newbcast:8
\n
"
" v_mul_f32 v241, v13, v241 row_newbcast:9
\n
"
" v_mul_f32 v242, v13, v242 row_newbcast:10
\n
"
" v_mul_f32 v243, v13, v243 row_newbcast:11
\n
"
" v_mul_f32 v240, v20, v240
\n
"
" v_mul_f32 v241, v20, v241
\n
"
" v_mul_f32 v242, v20, v242
\n
"
" v_mul_f32 v243, v20, v243
\n
"
" v_cvt_f32_i32 v244, v244
\n
"
" v_cvt_f32_i32 v245, v245
\n
"
" v_cvt_f32_i32 v246, v246
\n
"
" v_cvt_f32_i32 v247, v247
\n
"
" v_mul_f32 v244, v25, v244
\n
"
" v_mul_f32 v245, v25, v245
\n
"
" v_mul_f32 v246, v25, v246
\n
"
" v_mul_f32 v247, v25, v247
\n
"
" v_mul_f32 v244, v13, v244 row_newbcast:8
\n
"
" v_mul_f32 v245, v13, v245 row_newbcast:9
\n
"
" v_mul_f32 v246, v13, v246 row_newbcast:10
\n
"
" v_mul_f32 v247, v13, v247 row_newbcast:11
\n
"
" v_mul_f32 v244, v21, v244
\n
"
" v_mul_f32 v245, v21, v245
\n
"
" v_mul_f32 v246, v21, v246
\n
"
" v_mul_f32 v247, v21, v247
\n
"
" v_cvt_f32_i32 v248, v248
\n
"
" v_cvt_f32_i32 v249, v249
\n
"
" v_cvt_f32_i32 v250, v250
\n
"
" v_cvt_f32_i32 v251, v251
\n
"
" v_mul_f32 v248, v24, v248
\n
"
" v_mul_f32 v249, v24, v249
\n
"
" v_mul_f32 v250, v24, v250
\n
"
" v_mul_f32 v251, v24, v251
\n
"
" v_mul_f32 v248, v13, v248 row_newbcast:12
\n
"
" v_mul_f32 v249, v13, v249 row_newbcast:13
\n
"
" v_mul_f32 v250, v13, v250 row_newbcast:14
\n
"
" v_mul_f32 v251, v13, v251 row_newbcast:15
\n
"
" v_mul_f32 v248, v20, v248
\n
"
" v_mul_f32 v249, v20, v249
\n
"
" v_mul_f32 v250, v20, v250
\n
"
" v_mul_f32 v251, v20, v251
\n
"
" v_cvt_f32_i32 v252, v252
\n
"
" v_cvt_f32_i32 v253, v253
\n
"
" v_cvt_f32_i32 v254, v254
\n
"
" v_cvt_f32_i32 v255, v255
\n
"
" v_mul_f32 v252, v25, v252
\n
"
" v_mul_f32 v253, v25, v253
\n
"
" v_mul_f32 v254, v25, v254
\n
"
" v_mul_f32 v255, v25, v255
\n
"
" v_mul_f32 v252, v13, v252 row_newbcast:12
\n
"
" v_mul_f32 v253, v13, v253 row_newbcast:13
\n
"
" v_mul_f32 v254, v13, v254 row_newbcast:14
\n
"
" v_mul_f32 v255, v13, v255 row_newbcast:15
\n
"
" v_mul_f32 v252, v21, v252
\n
"
" v_mul_f32 v253, v21, v253
\n
"
" v_mul_f32 v254, v21, v254
\n
"
" v_mul_f32 v255, v21, v255
\n
"
" v_cmp_u_f32 s[48:49], v224, v224
\n
"
" v_add3_u32 v50, v224, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v225, v225
\n
"
" v_add3_u32 v50, v225, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v224, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v226, v226
\n
"
" v_add3_u32 v50, v226, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v227, v227
\n
"
" v_add3_u32 v50, v227, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v225, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v228, v228
\n
"
" v_add3_u32 v50, v228, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v229, v229
\n
"
" v_add3_u32 v50, v229, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v226, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v230, v230
\n
"
" v_add3_u32 v50, v230, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v231, v231
\n
"
" v_add3_u32 v50, v231, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v227, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v232, v232
\n
"
" v_add3_u32 v50, v232, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v233, v233
\n
"
" v_add3_u32 v50, v233, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v228, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v234, v234
\n
"
" v_add3_u32 v50, v234, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v235, v235
\n
"
" v_add3_u32 v50, v235, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v229, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v236, v236
\n
"
" v_add3_u32 v50, v236, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v237, v237
\n
"
" v_add3_u32 v50, v237, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v230, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v238, v238
\n
"
" v_add3_u32 v50, v238, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v239, v239
\n
"
" v_add3_u32 v50, v239, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v231, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v240, v240
\n
"
" v_add3_u32 v50, v240, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v241, v241
\n
"
" v_add3_u32 v50, v241, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v232, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v242, v242
\n
"
" v_add3_u32 v50, v242, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v243, v243
\n
"
" v_add3_u32 v50, v243, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v233, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v244, v244
\n
"
" v_add3_u32 v50, v244, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v245, v245
\n
"
" v_add3_u32 v50, v245, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v234, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v246, v246
\n
"
" v_add3_u32 v50, v246, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v247, v247
\n
"
" v_add3_u32 v50, v247, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v235, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v248, v248
\n
"
" v_add3_u32 v50, v248, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v249, v249
\n
"
" v_add3_u32 v50, v249, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v236, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v250, v250
\n
"
" v_add3_u32 v50, v250, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v251, v251
\n
"
" v_add3_u32 v50, v251, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v237, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v252, v252
\n
"
" v_add3_u32 v50, v252, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v253, v253
\n
"
" v_add3_u32 v50, v253, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v238, v55, v54, s52
\n
"
" v_cmp_u_f32 s[48:49], v254, v254
\n
"
" v_add3_u32 v50, v254, v53, 1
\n
"
" v_cndmask_b32 v54, v50, v52, s[48:49]
\n
"
" v_cmp_u_f32 s[48:49], v255, v255
\n
"
" v_add3_u32 v50, v255, v53, 1
\n
"
" v_cndmask_b32 v55, v50, v52, s[48:49]
\n
"
" v_perm_b32 v239, v55, v54, s52
\n
"
" ds_write_b64 v3, v[224:225] offset:35072
\n
"
" ds_write_b64 v3, v[226:227] offset:43776
\n
"
" ds_write_b64 v3, v[228:229] offset:37248
\n
"
" ds_write_b64 v3, v[230:231] offset:45952
\n
"
" ds_write_b64 v3, v[232:233] offset:39424
\n
"
" ds_write_b64 v3, v[234:235] offset:48128
\n
"
" ds_write_b64 v3, v[236:237] offset:41600
\n
"
" ds_write_b64 v3, v[238:239] offset:50304
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_barrier
\n
"
" ds_read_b32 v64, v4 offset:35072
\n
"
" ds_read_b32 v65, v4 offset:39424
\n
"
" ds_read_b32 v66, v4 offset:35104
\n
"
" ds_read_b32 v67, v4 offset:39456
\n
"
" ds_read_b32 v68, v4 offset:35136
\n
"
" ds_read_b32 v69, v4 offset:39488
\n
"
" ds_read_b32 v70, v4 offset:35168
\n
"
" ds_read_b32 v71, v4 offset:39520
\n
"
" ds_read_b32 v72, v4 offset:43776
\n
"
" ds_read_b32 v73, v4 offset:48128
\n
"
" ds_read_b32 v74, v4 offset:43808
\n
"
" ds_read_b32 v75, v4 offset:48160
\n
"
" ds_read_b32 v76, v4 offset:43840
\n
"
" ds_read_b32 v77, v4 offset:48192
\n
"
" ds_read_b32 v78, v4 offset:43872
\n
"
" ds_read_b32 v79, v4 offset:48224
\n
"
" s_waitcnt lgkmcnt(0)
\n
"
" s_mov_b64 exec, s[20:21]
\n
"
" global_atomic_pk_add_bf16 v80, v64, s[8:9]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[20:21]
\n
"
" global_atomic_pk_add_bf16 v80, v65, s[8:9] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[22:23]
\n
"
" global_atomic_pk_add_bf16 v82, v66, s[8:9]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[22:23]
\n
"
" global_atomic_pk_add_bf16 v82, v67, s[8:9] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[24:25]
\n
"
" global_atomic_pk_add_bf16 v84, v68, s[8:9]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[24:25]
\n
"
" global_atomic_pk_add_bf16 v84, v69, s[8:9] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[26:27]
\n
"
" global_atomic_pk_add_bf16 v86, v70, s[8:9]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[26:27]
\n
"
" global_atomic_pk_add_bf16 v86, v71, s[8:9] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[28:29]
\n
"
" global_atomic_pk_add_bf16 v88, v72, s[8:9]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[28:29]
\n
"
" global_atomic_pk_add_bf16 v88, v73, s[8:9] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[30:31]
\n
"
" global_atomic_pk_add_bf16 v90, v74, s[8:9]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[30:31]
\n
"
" global_atomic_pk_add_bf16 v90, v75, s[8:9] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[32:33]
\n
"
" global_atomic_pk_add_bf16 v92, v76, s[8:9]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[32:33]
\n
"
" global_atomic_pk_add_bf16 v92, v77, s[8:9] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[34:35]
\n
"
" global_atomic_pk_add_bf16 v94, v78, s[8:9]
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_mov_b64 exec, s[34:35]
\n
"
" global_atomic_pk_add_bf16 v94, v79, s[8:9] inst_offset:256
\n
"
" s_mov_b64 exec, s[36:37]
\n
"
" s_add_u32 s8, s59, s8
\n
"
" s_addc_u32 s9, 0, s9
\n
"
" s_addk_i32 s80, 0x0100
\n
"
" s_cmp_lt_i32 s80, s81
\n
"
" s_cbranch_scc0 label_2301
\n
"
" s_branch label_0C3C
\n
"
" label_2301:
\n
"
" s_waitcnt 0x0000
\n
"
" s_endpgm
\n
"
#undef _UK_MFMA_
#undef _UK_PK_CVT_
#undef _UK_ATOMIC_ADD_
include/ck_tile/ops/flatmm/block/uk/flatmm_uk_gfx9_32x512x256_1x1x1_16x16x32_int8.inc
View file @
a759277d
# define _DEQUAN_CVT_(a, b, c) \
#ifndef CK_TILE_FLATMM_UK_MFMA
" v_cvt_f32_i32 a[0], a[0]
\n
"
\
#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_INT8
" v_cvt_f32_i32 a[1], a[1]
\n
"
\
#endif
" v_cvt_f32_i32 a[2], a[2]
\n
"
\
" v_cvt_f32_i32 a[3], a[3]
\n
"
\
" v_mul_f32 a[0], v15, a[0]
\n
"
\
" v_mul_f32 a[1], v15, a[1]
\n
"
\
" v_mul_f32 a[2], v15, a[2]
\n
"
\
" v_mul_f32 a[3], v15, a[3]
\n
"
\
" v_mul_f32 a[0], v17, a[0] row_newbcast:12
\n
"
\
" v_mul_f32 a[1], v17, a[1] row_newbcast:13
\n
"
\
" v_mul_f32 a[2], v17, a[2] row_newbcast:14
\n
"
\
" v_mul_f32 a[3], v17, a[3] row_newbcast:15
\n
"
\
#if CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_INT8
# define _UK_MFMA_ "v_mfma_i32_16x16x32_i8"
#endif
# define _DEQUAN_CVT_(a0,a1,a2,a3, b, c) \
" v_cvt_f32_i32 a0, a0
\n
"
\
" v_cvt_f32_i32 a1, a1
\n
"
\
" v_cvt_f32_i32 a2, a2
\n
"
\
" v_cvt_f32_i32 a3, a3
\n
"
\
" v_mul_f32 a0, v15, a0
\n
"
\
" v_mul_f32 a1, v15, a1
\n
"
\
" v_mul_f32 a2, v15, a2
\n
"
\
" v_mul_f32 a3, v15, a3
\n
"
\
" v_mul_f32 a0, v17, a0 row_newbcast:12
\n
"
\
" v_mul_f32 a1, v17, a1 row_newbcast:13
\n
"
\
" v_mul_f32 a2, v17, a2 row_newbcast:14
\n
"
\
" v_mul_f32 a3, v17, a3 row_newbcast:15
\n
"
\
";-------------------------------
\n
"
"s_mov_b32 s28, %[s_res_aq0]
\n
"
"s_mov_b32 s29, %[s_res_aq1]
\n
"
"s_mov_b32 s30, %[s_res_aq2]
\n
"
"s_mov_b32 s31, %[s_res_aq3]
\n
"
"s_mov_b32 s16, %[s_res_dq0]
\n
"
"s_mov_b32 s16, %[s_res_dq0]
\n
"
"s_mov_b32 s17, %[s_res_dq1]
\n
"
"s_mov_b32 s17, %[s_res_dq1]
\n
"
"s_mov_b32 s18, %[s_res_dq2]
\n
"
"s_mov_b32 s18, %[s_res_dq2]
\n
"
...
@@ -32,19 +43,7 @@
...
@@ -32,19 +43,7 @@
"s_mov_b32 s25, %[s_res_b1]
\n
"
"s_mov_b32 s25, %[s_res_b1]
\n
"
"s_mov_b32 s26, %[s_res_b2]
\n
"
"s_mov_b32 s26, %[s_res_b2]
\n
"
"s_mov_b32 s27, %[s_res_b3]
\n
"
"s_mov_b32 s27, %[s_res_b3]
\n
"
";----------------------------------------------
\n
"
//////////GQ/DQ/GsmQ_addr///////////////
//expert weight addr no need
// s_mul_i32 s60, s3, 32 // 00000000056C: 923CA003 s3 s_tg_idy
// s_mul_i32 s60, 4, s60 // 000000000570: 923C3C84
// s_add_u32 s40, s60, s40 // 000000000574: 8028283C s40 sw_ptr
// s_addc_u32 s41, 0, s41 // 000000000578: 82292980 s41 sw_ptr
// v_and_b32 v54, 15, v0 // 00000000057C: 266C008F
// v_lshlrev_b32 v8, 2, v54 // 000000000580: 24106C82 v8/9 w addr
// v_add_u32 v9, 64, v8 // 000000000584: 681210C0
//GQDQ addr function kkkkkkkkkkkkkk
" v_lshrrev_b32 v54, 4, v0
\n
"
" v_lshrrev_b32 v54, 4, v0
\n
"
" v_lshlrev_b32 v55, 2, v54
\n
"
" v_lshlrev_b32 v55, 2, v54
\n
"
" v_and_b32 v54, 15, v0
\n
"
" v_and_b32 v54, 15, v0
\n
"
...
@@ -55,21 +54,17 @@
...
@@ -55,21 +54,17 @@
" v_add_u32 v55, v54, v55
\n
"
" v_add_u32 v55, v54, v55
\n
"
" v_lshlrev_b32 v10, 2, v55
\n
"
" v_lshlrev_b32 v10, 2, v55
\n
"
" v_add_u32 v11, 0x00000400, v10
\n
"
" v_add_u32 v11, 0x00000400, v10
\n
"
" s_mul_i32 s60, %[s_wave_id], 16
\n
"
" s_mul_i32 s60, %[s_wave_id], 16
\n
"
" s_mul_i32 s60, s60, 4
\n
"
" s_mul_i32 s60, s60, 4
\n
"
" v_add_u32 v10, s60, v10
\n
"
" v_add_u32 v10, s60, v10
\n
"
" v_add_u32 v11, s60, v11
\n
"
" v_add_u32 v11, s60, v11
\n
"
" v_mov_b32 v5, v10
\n
"
" v_mov_b32 v5, v10
\n
"
";----------------------------------------------
\n
"
//////////////////////////////
" s_mov_b32 s57, 0x00000100
\n
"
" s_mov_b32 s57, 0x00000100
\n
"
" s_mov_b32 s58, 0x00001000
\n
"
" s_mov_b32 s58, 0x00001000
\n
"
" s_mov_b32 s79, 0x00000400
\n
"
" s_mov_b32 s79, 0x00000400
\n
"
" s_mov_b32 s59, 0x00000200
\n
"
" s_mov_b32 s59, 0x00000200
\n
"
////////
";----------------------------------------------
\n
"
//" s_mul_i32 s60, s70, 0x00000100 \n"
//" s_sub_u32 s56, s60, 0x00001000 \n"
///////////////
" s_mov_b32 s78, 0x00001000
\n
"
" s_mov_b32 s78, 0x00001000
\n
"
" s_mov_b32 s52, 0x07060302
\n
"
" s_mov_b32 s52, 0x07060302
\n
"
" s_mov_b32 s53, 0x00000400
\n
"
" s_mov_b32 s53, 0x00000400
\n
"
...
@@ -82,7 +77,7 @@
...
@@ -82,7 +77,7 @@
" v_mov_b32 v52, 0x7fff0000
\n
"
" v_mov_b32 v52, 0x7fff0000
\n
"
" v_mov_b32 v53, 0x00007fff
\n
"
" v_mov_b32 v53, 0x00007fff
\n
"
" s_waitcnt 0x0000
\n
"
" s_waitcnt 0x0000
\n
"
///XQ ADDR, fake token id
";----------------------------------------------
\n
"
" v_mov_b32 %[v_token_id], %[v_token_id]
\n
"
" v_mov_b32 %[v_token_id], %[v_token_id]
\n
"
" v_lshrrev_b32 v54, 24, %[v_token_id]
\n
"
" v_lshrrev_b32 v54, 24, %[v_token_id]
\n
"
" v_mul_i32_i24 v54, s66, v54
\n
"
" v_mul_i32_i24 v54, s66, v54
\n
"
...
@@ -104,8 +99,7 @@
...
@@ -104,8 +99,7 @@
" buffer_load_dword v21, v9, s[40:43], 0 offen
\n
"
" buffer_load_dword v21, v9, s[40:43], 0 offen
\n
"
" s_mov_b32 s80, 0
\n
"
" s_mov_b32 s80, 0
\n
"
//---------------------v26-33 no need
";----------------------------------------------
\n
"
// "s_nop 4\n"
"; -- prefetch A0
\n
"
"; -- prefetch A0
\n
"
"s_add_u32 m0, 0, %[s_m0_init]
\n
"
"s_add_u32 m0, 0, %[s_m0_init]
\n
"
"buffer_load_dword %[v_os_a0], s[20:23], 0 offen lds
\n
"
"buffer_load_dword %[v_os_a0], s[20:23], 0 offen lds
\n
"
...
@@ -183,18 +177,17 @@
...
@@ -183,18 +177,17 @@
" s_waitcnt vmcnt(40)
\n
"
" s_waitcnt vmcnt(40)
\n
"
" s_barrier
\n
"
" s_barrier
\n
"
///////////////////////////////
";----------------------------------------------
\n
"
"ds_read_b128 v[192:195], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_0]
\n
"
// 1024: N stride, 64 K stride
"ds_read_b128 v[192:195], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_0]
\n
"
// 1024: N stride, 64 K stride
"ds_read_b128 v[196:199], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_1]
\n
"
"ds_read_b128 v[196:199], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_1]
\n
"
"ds_read_b128 v[200:203], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_2]
\n
"
"ds_read_b128 v[200:203], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_2]
\n
"
"ds_read_b128 v[204:207], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_3]
\n
"
"ds_read_b128 v[204:207], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_3]
\n
"
"ds_read_b128 v[208:211], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_4]
\n
"
"ds_read_b128 v[208:211], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_4]
\n
"
"ds_read_b128 v[212:215], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_5]
\n
"
"ds_read_b128 v[212:215], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_5]
\n
"
"ds_read_b128 v[216:219], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_6]
\n
"
"ds_read_b128 v[216:219], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_6]
\n
"
"ds_read_b128 v[220:223], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_7]
\n
"
"ds_read_b128 v[220:223], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_7]
\n
"
////////////////////////////
";----------------------------------------------
\n
"
" label_start:
\n
"
"label_start:
" s_waitcnt vmcnt(24) & lgkmcnt(0)
\n
"
" s_waitcnt vmcnt(24) & lgkmcnt(0)
\n
"
" s_barrier
\n
"
" s_barrier
\n
"
_UK_MFMA_
" v[128:131], acc[0:1], v[192:193], v[128:131]
\n
"
_UK_MFMA_
" v[128:131], acc[0:1], v[192:193], v[128:131]
\n
"
...
@@ -400,7 +393,7 @@
...
@@ -400,7 +393,7 @@
" s_waitcnt vmcnt(24) & lgkmcnt(0)
\n
"
" s_waitcnt vmcnt(24) & lgkmcnt(0)
\n
"
" s_barrier
\n
"
" s_barrier
\n
"
_UK_MFMA_
" v[128:131], acc[128:129], v[224:225], v[128:131]
\n
"
_UK_MFMA_
" v[128:131], acc[128:129], v[224:225], v[128:131]
\n
"
_UK_MFMA_ "
v
[
128
:
131
],
acc
[
130
:
131
],
v
[
226
:
227
],
v
[
128
:
131
]
\
n
"
_UK_MFMA_
" v[128:131], acc[130:131], v[226:227], v[128:131]
\n
"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[24:27], 0 offen
\n
"
" buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[24:27], 0 offen
\n
"
_UK_MFMA_
" v[128:131], acc[132:133], v[228:229], v[128:131]
\n
"
_UK_MFMA_
" v[128:131], acc[132:133], v[228:229], v[128:131]
\n
"
_UK_MFMA_
" v[128:131], acc[134:135], v[230:231], v[128:131]
\n
"
_UK_MFMA_
" v[128:131], acc[134:135], v[230:231], v[128:131]
\n
"
...
@@ -461,49 +454,49 @@
...
@@ -461,49 +454,49 @@
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[24:27], 0 offen
\n
"
" buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[24:27], 0 offen
\n
"
_UK_MFMA_
" v[144:147], acc[164:165], v[228:229], v[144:147]
\n
"
_UK_MFMA_
" v[144:147], acc[164:165], v[228:229], v[144:147]
\n
"
_UK_MFMA_
" v[144:147], acc[166:167], v[230:231], v[144:147]
\n
"
_UK_MFMA_
" v[144:147], acc[166:167], v[230:231], v[144:147]
\n
"
"
ds_read_b128
v
[
192
:
195
],
%
[
v_os_sld
]
offset
:
0
*%
[
smem_sz
]
+
%
[
sld_os_0
]
" ds_read_b128 v[192:195], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_0]
\n
"
_UK_MFMA_
" v[144:147], acc[168:169], v[232:233], v[144:147]
\n
"
_UK_MFMA_
" v[144:147], acc[168:169], v[232:233], v[144:147]
\n
"
_UK_MFMA_
" v[144:147], acc[170:171], v[234:235], v[144:147]
\n
"
_UK_MFMA_
" v[144:147], acc[170:171], v[234:235], v[144:147]
\n
"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[24:27], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[24:27], 0 offen offset:1024
\n
"
_UK_MFMA_
" v[144:147], acc[172:173], v[236:237], v[144:147]
\n
"
_UK_MFMA_
" v[144:147], acc[172:173], v[236:237], v[144:147]
\n
"
_UK_MFMA_
" v[144:147], acc[174:175], v[238:239], v[144:147]
\n
"
_UK_MFMA_
" v[144:147], acc[174:175], v[238:239], v[144:147]
\n
"
" ds_read_b128 v[196:199], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_1]
" ds_read_b128 v[196:199], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_1]
\n
"
_UK_MFMA_
" v[148:151], acc[160:161], v[240:241], v[148:151]
\n
"
_UK_MFMA_
" v[148:151], acc[160:161], v[240:241], v[148:151]
\n
"
_UK_MFMA_
" v[148:151], acc[162:163], v[242:243], v[148:151]
\n
"
_UK_MFMA_
" v[148:151], acc[162:163], v[242:243], v[148:151]
\n
"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[24:27], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[24:27], 0 offen offset:2048
\n
"
_UK_MFMA_
" v[148:151], acc[164:165], v[244:245], v[148:151]
\n
"
_UK_MFMA_
" v[148:151], acc[164:165], v[244:245], v[148:151]
\n
"
_UK_MFMA_
" v[148:151], acc[166:167], v[246:247], v[148:151]
\n
"
_UK_MFMA_
" v[148:151], acc[166:167], v[246:247], v[148:151]
\n
"
"
ds_read_b128
v
[
200
:
203
],
%
[
v_os_sld
]
offset
:
0
*%
[
smem_sz
]
+
%
[
sld_os_2
]
" ds_read_b128 v[200:203], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_2]
\n
"
_UK_MFMA_
" v[148:151], acc[168:169], v[248:249], v[148:151]
\n
"
_UK_MFMA_
" v[148:151], acc[168:169], v[248:249], v[148:151]
\n
"
_UK_MFMA_
" v[148:151], acc[170:171], v[250:251], v[148:151]
\n
"
_UK_MFMA_
" v[148:151], acc[170:171], v[250:251], v[148:151]
\n
"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[24:27], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[24:27], 0 offen offset:3072
\n
"
_UK_MFMA_
" v[148:151], acc[172:173], v[252:253], v[148:151]
\n
"
_UK_MFMA_
" v[148:151], acc[172:173], v[252:253], v[148:151]
\n
"
_UK_MFMA_
" v[148:151], acc[174:175], v[254:255], v[148:151]
\n
"
_UK_MFMA_
" v[148:151], acc[174:175], v[254:255], v[148:151]
\n
"
" ds_read_b128 v[204:207], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_3]
" ds_read_b128 v[204:207], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_3]
\n
"
_UK_MFMA_
" v[152:155], acc[176:177], v[224:225], v[152:155]
\n
"
_UK_MFMA_
" v[152:155], acc[176:177], v[224:225], v[152:155]
\n
"
_UK_MFMA_
" v[152:155], acc[178:179], v[226:227], v[152:155]
\n
"
_UK_MFMA_
" v[152:155], acc[178:179], v[226:227], v[152:155]
\n
"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[24:27], 0 offen
\n
"
" buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[24:27], 0 offen
\n
"
_UK_MFMA_
" v[152:155], acc[180:181], v[228:229], v[152:155]
\n
"
_UK_MFMA_
" v[152:155], acc[180:181], v[228:229], v[152:155]
\n
"
_UK_MFMA_
" v[152:155], acc[182:183], v[230:231], v[152:155]
\n
"
_UK_MFMA_
" v[152:155], acc[182:183], v[230:231], v[152:155]
\n
"
"
ds_read_b128
v
[
208
:
211
],
%
[
v_os_sld
]
offset
:
0
*%
[
smem_sz
]
+
%
[
sld_os_4
]
" ds_read_b128 v[208:211], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_4]
\n
"
_UK_MFMA_
" v[152:155], acc[184:185], v[232:233], v[152:155]
\n
"
_UK_MFMA_
" v[152:155], acc[184:185], v[232:233], v[152:155]
\n
"
_UK_MFMA_
" v[152:155], acc[186:187], v[234:235], v[152:155]
\n
"
_UK_MFMA_
" v[152:155], acc[186:187], v[234:235], v[152:155]
\n
"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[24:27], 0 offen offset:1024
\n
"
" buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[24:27], 0 offen offset:1024
\n
"
_UK_MFMA_
" v[152:155], acc[188:189], v[236:237], v[152:155]
\n
"
_UK_MFMA_
" v[152:155], acc[188:189], v[236:237], v[152:155]
\n
"
_UK_MFMA_
" v[152:155], acc[190:191], v[238:239], v[152:155]
\n
"
_UK_MFMA_
" v[152:155], acc[190:191], v[238:239], v[152:155]
\n
"
" ds_read_b128 v[212:215], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_5]
" ds_read_b128 v[212:215], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_5]
\n
"
_UK_MFMA_
" v[156:159], acc[176:177], v[240:241], v[156:159]
\n
"
_UK_MFMA_
" v[156:159], acc[176:177], v[240:241], v[156:159]
\n
"
_UK_MFMA_
" v[156:159], acc[178:179], v[242:243], v[156:159]
\n
"
_UK_MFMA_
" v[156:159], acc[178:179], v[242:243], v[156:159]
\n
"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[24:27], 0 offen offset:2048
\n
"
" buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[24:27], 0 offen offset:2048
\n
"
_UK_MFMA_
" v[156:159], acc[180:181], v[244:245], v[156:159]
\n
"
_UK_MFMA_
" v[156:159], acc[180:181], v[244:245], v[156:159]
\n
"
_UK_MFMA_
" v[156:159], acc[182:183], v[246:247], v[156:159]
\n
"
_UK_MFMA_
" v[156:159], acc[182:183], v[246:247], v[156:159]
\n
"
"
ds_read_b128
v
[
216
:
219
],
%
[
v_os_sld
]
offset
:
0
*%
[
smem_sz
]
+
%
[
sld_os_6
]
" ds_read_b128 v[216:219], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_6]
\n
"
_UK_MFMA_
" v[156:159], acc[184:185], v[248:249], v[156:159]
\n
"
_UK_MFMA_
" v[156:159], acc[184:185], v[248:249], v[156:159]
\n
"
_UK_MFMA_
" v[156:159], acc[186:187], v[250:251], v[156:159]
\n
"
_UK_MFMA_
" v[156:159], acc[186:187], v[250:251], v[156:159]
\n
"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[24:27], 0 offen offset:3072
\n
"
" buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[24:27], 0 offen offset:3072
\n
"
_UK_MFMA_
" v[156:159], acc[188:189], v[252:253], v[156:159]
\n
"
_UK_MFMA_
" v[156:159], acc[188:189], v[252:253], v[156:159]
\n
"
_UK_MFMA_
" v[156:159], acc[190:191], v[254:255], v[156:159]
\n
"
_UK_MFMA_
" v[156:159], acc[190:191], v[254:255], v[156:159]
\n
"
" ds_read_b128 v[220:223], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_7]
" ds_read_b128 v[220:223], %[v_os_sld] offset:0*%[smem_sz] + %[sld_os_7]
\n
"
" s_waitcnt vmcnt(32)
\n
"
" s_waitcnt vmcnt(32)
\n
"
_UK_MFMA_
" v[160:163], acc[192:193], v[224:225], v[160:163]
\n
"
_UK_MFMA_
" v[160:163], acc[192:193], v[224:225], v[160:163]
\n
"
_UK_MFMA_
" v[160:163], acc[194:195], v[226:227], v[160:163]
\n
"
_UK_MFMA_
" v[160:163], acc[194:195], v[226:227], v[160:163]
\n
"
...
@@ -601,7 +594,7 @@
...
@@ -601,7 +594,7 @@
" s_cbranch_scc0 label_end
\n
"
" s_cbranch_scc0 label_end
\n
"
" s_branch label_start%=
\n
"
" s_branch label_start%=
\n
"
" label_end :
\n
"
" label_end :
\n
"
//dequant
";----------------------------------------------
\n
"
" v_cvt_f32_i32 v128, v128
\n
"
" v_cvt_f32_i32 v128, v128
\n
"
" v_cvt_f32_i32 v129, v129
\n
"
" v_cvt_f32_i32 v129, v129
\n
"
" v_cvt_f32_i32 v130, v130
\n
"
" v_cvt_f32_i32 v130, v130
\n
"
...
@@ -794,7 +787,7 @@
...
@@ -794,7 +787,7 @@
" v_mul_f32 v189, v17, v189 row_newbcast:13
\n
"
" v_mul_f32 v189, v17, v189 row_newbcast:13
\n
"
" v_mul_f32 v190, v17, v190 row_newbcast:14
\n
"
" v_mul_f32 v190, v17, v190 row_newbcast:14
\n
"
" v_mul_f32 v191, v17, v191 row_newbcast:15
\n
"
" v_mul_f32 v191, v17, v191 row_newbcast:15
\n
"
#undef _UK_MFMA_
#undef _UK_MFMA_
//dequant end
#undef _DEQUAN_CVT_
include/ck_tile/ops/fused_moe.hpp
View file @
a759277d
...
@@ -10,6 +10,7 @@
...
@@ -10,6 +10,7 @@
#include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_ex.hpp"
#include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_ex.hpp"
#include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp"
#include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy.hpp"
#include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp"
#include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp"
#include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk_int8.hpp"
#include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_problem.hpp"
#include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_problem.hpp"
#include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_traits.hpp"
#include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_traits.hpp"
#include "ck_tile/ops/fused_moe/pipeline/moe_sorting_pipeline.hpp"
#include "ck_tile/ops/fused_moe/pipeline/moe_sorting_pipeline.hpp"
...
...
include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk_int8.hpp
View file @
a759277d
...
@@ -198,7 +198,7 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8
...
@@ -198,7 +198,7 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8
//addr in fact
//addr in fact
auto
a_coords
=
generate_tuple
(
auto
a_coords
=
generate_tuple
(
[
&
](
auto
i
)
{
[
&
](
auto
i
)
{
return
(
token_id
)
*
kargs
.
stride_token
+
return
(
token_id
[
i
]
)
*
kargs
.
stride_token
+
threadIdx
.
x
%
(
BlockShape
::
Block_K0
/
kAlignmentA
)
*
kAlignmentA
;
threadIdx
.
x
%
(
BlockShape
::
Block_K0
/
kAlignmentA
)
*
kAlignmentA
;
},
},
number
<
row_ids_a
.
size
()
>
{});
number
<
row_ids_a
.
size
()
>
{});
...
@@ -254,7 +254,7 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8
...
@@ -254,7 +254,7 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8
make_tuple
(
shared_intermediate_size_1
),
make_tuple
(
shared_intermediate_size_1
),
number
<
1
>
{});
number
<
1
>
{});
return
g_view_
;
return
g
q
_view_
;
}();
}();
auto
gq_res
=
gq_win
.
get_buffer_view
().
cached_buf_res_
;
auto
gq_res
=
gq_win
.
get_buffer_view
().
cached_buf_res_
;
...
@@ -345,7 +345,7 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8
...
@@ -345,7 +345,7 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8
auto
o_coords
=
generate_tuple
(
auto
o_coords
=
generate_tuple
(
[
&
](
auto
i
)
{
[
&
](
auto
i
)
{
return
token_id
*
kargs
.
stride_token
+
return
token_id
[
i
]
*
kargs
.
stride_token
+
threadIdx
.
x
%
(
BlockShape
::
Block_N1
/
kAlignmentO
)
*
kAlignmentO
;
threadIdx
.
x
%
(
BlockShape
::
Block_N1
/
kAlignmentO
)
*
kAlignmentO
;
},
},
number
<
row_ids_a
.
size
()
>
{});
number
<
row_ids_a
.
size
()
>
{});
...
@@ -376,6 +376,7 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8
...
@@ -376,6 +376,7 @@ struct FusedMoeGemmPipeline_FlatmmUk_int8
row_ids_a
,
//fake token id, 2D index for X scale
row_ids_a
,
//fake token id, 2D index for X scale
aq_res
,
aq_res
,
gq_res
,
gq_res
,
gq_res
,
dq_res
,
dq_res
,
a_res
,
a_res
,
a_coords
,
a_coords
,
...
...
include/ck_tile/ops/gemm/warp/warp_gemm.hpp
View file @
a759277d
...
@@ -143,7 +143,7 @@ using WarpGemmMfmaFp8Fp8F32M32N32K16SwizzleBTransposedCDistribution =
...
@@ -143,7 +143,7 @@ using WarpGemmMfmaFp8Fp8F32M32N32K16SwizzleBTransposedCDistribution =
// int8
// int8
using
WarpGemmMfma_i32_16x16x64_int8_int8_CTransposed
=
using
WarpGemmMfma_i32_16x16x64_int8_int8_CTransposed
=
WarpGemmImpl
<
WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution
<
WarpGemmImpl
<
WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution
<
WarpGemmAttributeMfmaImpl_i32_16x16x32_i8
<
WGAttrCtlEnum
::
Default_
>
,
WarpGemmAttributeMfmaImpl_i32_16x16x32_i8
<
WGAttrCtlEnum
::
Default_
>
,
2
>>
;
2
>>
;
}
// namespace ck_tile
}
// namespace ck_tile
include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp
View file @
a759277d
...
@@ -655,7 +655,7 @@ struct WarpGemmAttributeMfmaImpl_i32_16x16x32_i8
...
@@ -655,7 +655,7 @@ struct WarpGemmAttributeMfmaImpl_i32_16x16x32_i8
else
else
{
{
#if defined(__gfx94__)
#if defined(__gfx94__)
c_vec
=
__builtin_amdgcn_mfma_i32_16x16x32i8
(
c_vec
=
__builtin_amdgcn_mfma_i32_16x16x32
_
i8
(
bit_cast
<
long
>
(
a_vec
),
bit_cast
<
long
>
(
b_vec
),
c_vec
,
0
,
0
,
0
);
bit_cast
<
long
>
(
a_vec
),
bit_cast
<
long
>
(
b_vec
),
c_vec
,
0
,
0
,
0
);
#elif defined(__gfx908__) || defined(__gfx90a__)
#elif defined(__gfx908__) || defined(__gfx90a__)
static_for
<
0
,
8
,
1
>
{}([
&
](
auto
k
)
{
static_for
<
0
,
8
,
1
>
{}([
&
](
auto
k
)
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment