Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
dec32dc6
Commit
dec32dc6
authored
Jan 31, 2025
by
ThomasNing
Browse files
Finish the feature and merge with develop on the computeV2
parents
71352c44
c5fff071
Changes
215
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
267 additions
and
186 deletions
+267
-186
example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n4096_instance.cpp
...thquant/instances/moe_smoothquant_bf16_n4096_instance.cpp
+8
-4
example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n4096_tp_instance.cpp
...uant/instances/moe_smoothquant_bf16_n4096_tp_instance.cpp
+8
-4
example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n512_instance.cpp
...othquant/instances/moe_smoothquant_bf16_n512_instance.cpp
+9
-4
example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n64_n128_instance.cpp
...uant/instances/moe_smoothquant_bf16_n64_n128_instance.cpp
+7
-3
example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n768_instance.cpp
...othquant/instances/moe_smoothquant_bf16_n768_instance.cpp
+7
-3
example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n1024_instance.cpp
...thquant/instances/moe_smoothquant_fp16_n1024_instance.cpp
+9
-4
example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n1536_instance.cpp
...thquant/instances/moe_smoothquant_fp16_n1536_instance.cpp
+9
-4
example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n2048_instance.cpp
...thquant/instances/moe_smoothquant_fp16_n2048_instance.cpp
+8
-4
example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n256_instance.cpp
...othquant/instances/moe_smoothquant_fp16_n256_instance.cpp
+7
-3
example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n3072_instance.cpp
...thquant/instances/moe_smoothquant_fp16_n3072_instance.cpp
+8
-4
example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n4096_instance.cpp
...thquant/instances/moe_smoothquant_fp16_n4096_instance.cpp
+8
-4
example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n4096_tp_instance.cpp
...uant/instances/moe_smoothquant_fp16_n4096_tp_instance.cpp
+8
-4
example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n512_instance.cpp
...othquant/instances/moe_smoothquant_fp16_n512_instance.cpp
+9
-4
example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n64_n128_instance.cpp
...uant/instances/moe_smoothquant_fp16_n64_n128_instance.cpp
+7
-3
example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n768_instance.cpp
...othquant/instances/moe_smoothquant_fp16_n768_instance.cpp
+7
-3
example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fwd_api.cpp
.../14_moe_smoothquant/instances/moe_smoothquant_fwd_api.cpp
+55
-45
example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_instance_common.hpp
...smoothquant/instances/moe_smoothquant_instance_common.hpp
+12
-9
example/ck_tile/14_moe_smoothquant/moe_smoothquant.cpp
example/ck_tile/14_moe_smoothquant/moe_smoothquant.cpp
+40
-28
example/ck_tile/14_moe_smoothquant/moe_smoothquant.hpp
example/ck_tile/14_moe_smoothquant/moe_smoothquant.hpp
+14
-24
example/ck_tile/14_moe_smoothquant/script/smoke_test.sh
example/ck_tile/14_moe_smoothquant/script/smoke_test.sh
+27
-25
No files found.
example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n4096_instance.cpp
View file @
dec32dc6
...
...
@@ -6,9 +6,13 @@
// clang-format off
// rm rn tm tn vn pd 2p
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
1
,
2
,
1
,
256
,
8
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
1
,
4
,
1
,
256
,
4
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
1
,
2
,
1
,
1024
,
2
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
1
,
4
,
1
,
1024
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
ck_tile
::
int8_t
,
1
,
2
,
1
,
256
,
8
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
ck_tile
::
int8_t
,
1
,
4
,
1
,
256
,
4
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
ck_tile
::
int8_t
,
1
,
2
,
1
,
1024
,
2
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
ck_tile
::
int8_t
,
1
,
4
,
1
,
1024
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
ck_tile
::
fp8_t
,
1
,
2
,
1
,
256
,
8
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
ck_tile
::
fp8_t
,
1
,
4
,
1
,
256
,
4
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
ck_tile
::
fp8_t
,
1
,
2
,
1
,
1024
,
2
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
ck_tile
::
fp8_t
,
1
,
4
,
1
,
1024
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
// clang-format on
example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n4096_tp_instance.cpp
View file @
dec32dc6
...
...
@@ -6,9 +6,13 @@
// clang-format off
// rm rn tm tn vn pd 2p
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
1
,
2
,
1
,
256
,
8
,
true
,
true
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
1
,
4
,
1
,
256
,
4
,
true
,
true
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
1
,
2
,
1
,
1024
,
2
,
true
,
true
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
1
,
4
,
1
,
1024
,
1
,
true
,
true
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
ck_tile
::
int8_t
,
1
,
2
,
1
,
256
,
8
,
true
,
true
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
ck_tile
::
int8_t
,
1
,
4
,
1
,
256
,
4
,
true
,
true
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
ck_tile
::
int8_t
,
1
,
2
,
1
,
1024
,
2
,
true
,
true
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
ck_tile
::
int8_t
,
1
,
4
,
1
,
1024
,
1
,
true
,
true
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
ck_tile
::
fp8_t
,
1
,
2
,
1
,
256
,
8
,
true
,
true
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
ck_tile
::
fp8_t
,
1
,
4
,
1
,
256
,
4
,
true
,
true
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
ck_tile
::
fp8_t
,
1
,
2
,
1
,
1024
,
2
,
true
,
true
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
ck_tile
::
fp8_t
,
1
,
4
,
1
,
1024
,
1
,
true
,
true
>
>
(
const
S
&
,
A
);
// clang-format on
example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n512_instance.cpp
View file @
dec32dc6
...
...
@@ -6,8 +6,13 @@
// clang-format off
// rm rn tm tn vn pd 2p
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
1
,
1
,
4
,
64
,
8
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
1
,
2
,
4
,
64
,
4
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
1
,
4
,
4
,
64
,
2
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
1
,
8
,
4
,
64
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
ck_tile
::
int8_t
,
1
,
1
,
4
,
64
,
8
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
ck_tile
::
int8_t
,
1
,
2
,
4
,
64
,
4
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
ck_tile
::
int8_t
,
1
,
4
,
4
,
64
,
2
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
ck_tile
::
int8_t
,
1
,
8
,
4
,
64
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
ck_tile
::
fp8_t
,
1
,
1
,
4
,
64
,
8
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
ck_tile
::
fp8_t
,
1
,
2
,
4
,
64
,
4
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
ck_tile
::
fp8_t
,
1
,
4
,
4
,
64
,
2
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
ck_tile
::
fp8_t
,
1
,
8
,
4
,
64
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
// clang-format on
example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n64_n128_instance.cpp
View file @
dec32dc6
...
...
@@ -6,7 +6,11 @@
// clang-format off
// rm rn tm tn vn pd 2p
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
1
,
1
,
4
,
64
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
1
,
1
,
4
,
64
,
2
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
1
,
2
,
4
,
64
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
ck_tile
::
int8_t
,
1
,
1
,
4
,
64
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
ck_tile
::
int8_t
,
1
,
1
,
4
,
64
,
2
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
ck_tile
::
int8_t
,
1
,
2
,
4
,
64
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
ck_tile
::
fp8_t
,
1
,
1
,
4
,
64
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
ck_tile
::
fp8_t
,
1
,
1
,
4
,
64
,
2
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
ck_tile
::
fp8_t
,
1
,
2
,
4
,
64
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
// clang-format on
example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n768_instance.cpp
View file @
dec32dc6
...
...
@@ -6,7 +6,11 @@
// clang-format off
// rm rn tm tn vn pd 2p
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
1
,
3
,
4
,
64
,
4
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
1
,
6
,
4
,
64
,
2
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
1
,
12
,
4
,
64
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
ck_tile
::
int8_t
,
1
,
3
,
4
,
64
,
4
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
ck_tile
::
int8_t
,
1
,
6
,
4
,
64
,
2
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
ck_tile
::
int8_t
,
1
,
12
,
4
,
64
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
ck_tile
::
fp8_t
,
1
,
3
,
4
,
64
,
4
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
ck_tile
::
fp8_t
,
1
,
6
,
4
,
64
,
2
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
bf16_t
,
ck_tile
::
fp8_t
,
1
,
12
,
4
,
64
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
// clang-format on
example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n1024_instance.cpp
View file @
dec32dc6
...
...
@@ -15,8 +15,13 @@ template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1, 16, 4, 64, 1, true
template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1, 1, 1, 256, 4, true ,false>>(const S&, A);
#endif
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
1
,
2
,
128
,
8
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
2
,
2
,
128
,
4
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
4
,
2
,
128
,
2
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
4
,
1
,
256
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
int8_t
,
1
,
1
,
2
,
128
,
8
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
int8_t
,
1
,
2
,
2
,
128
,
4
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
int8_t
,
1
,
4
,
2
,
128
,
2
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
int8_t
,
1
,
4
,
1
,
256
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
fp8_t
,
1
,
1
,
2
,
128
,
8
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
fp8_t
,
1
,
2
,
2
,
128
,
4
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
fp8_t
,
1
,
4
,
2
,
128
,
2
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
fp8_t
,
1
,
4
,
1
,
256
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
// clang-format on
example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n1536_instance.cpp
View file @
dec32dc6
...
...
@@ -6,8 +6,13 @@
// clang-format off
// rm rn tm tn vn pd 2p
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
3
,
4
,
64
,
8
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
3
,
2
,
128
,
4
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
3
,
1
,
256
,
2
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
6
,
1
,
256
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
int8_t
,
1
,
3
,
4
,
64
,
8
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
int8_t
,
1
,
3
,
2
,
128
,
4
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
int8_t
,
1
,
3
,
1
,
256
,
2
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
int8_t
,
1
,
6
,
1
,
256
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
fp8_t
,
1
,
3
,
4
,
64
,
8
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
fp8_t
,
1
,
3
,
2
,
128
,
4
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
fp8_t
,
1
,
3
,
1
,
256
,
2
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
fp8_t
,
1
,
6
,
1
,
256
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
// clang-format on
example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n2048_instance.cpp
View file @
dec32dc6
...
...
@@ -6,9 +6,13 @@
// clang-format off
// rm rn tm tn vn pd 2p
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
1
,
1
,
256
,
8
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
2
,
1
,
256
,
4
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
4
,
1
,
256
,
2
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
8
,
1
,
256
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
int8_t
,
1
,
1
,
1
,
256
,
8
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
int8_t
,
1
,
2
,
1
,
256
,
4
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
int8_t
,
1
,
4
,
1
,
256
,
2
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
int8_t
,
1
,
8
,
1
,
256
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
fp8_t
,
1
,
1
,
1
,
256
,
8
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
fp8_t
,
1
,
2
,
1
,
256
,
4
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
fp8_t
,
1
,
4
,
1
,
256
,
2
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
fp8_t
,
1
,
8
,
1
,
256
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
// clang-format on
example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n256_instance.cpp
View file @
dec32dc6
...
...
@@ -6,7 +6,11 @@
// clang-format off
// rm rn tm tn vn pd 2p
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
1
,
4
,
64
,
4
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
2
,
4
,
64
,
2
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
4
,
4
,
64
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
int8_t
,
1
,
1
,
4
,
64
,
4
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
int8_t
,
1
,
2
,
4
,
64
,
2
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
int8_t
,
1
,
4
,
4
,
64
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
fp8_t
,
1
,
1
,
4
,
64
,
4
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
fp8_t
,
1
,
2
,
4
,
64
,
2
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
fp8_t
,
1
,
4
,
4
,
64
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
// clang-format on
example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n3072_instance.cpp
View file @
dec32dc6
...
...
@@ -6,9 +6,13 @@
// clang-format off
// rm rn tm tn vn pd 2p
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
3
,
1
,
128
,
8
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
3
,
1
,
256
,
4
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
6
,
1
,
256
,
2
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
3
,
1
,
1024
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
int8_t
,
1
,
3
,
1
,
128
,
8
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
int8_t
,
1
,
3
,
1
,
256
,
4
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
int8_t
,
1
,
6
,
1
,
256
,
2
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
int8_t
,
1
,
3
,
1
,
1024
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
fp8_t
,
1
,
3
,
1
,
128
,
8
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
fp8_t
,
1
,
3
,
1
,
256
,
4
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
fp8_t
,
1
,
6
,
1
,
256
,
2
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
fp8_t
,
1
,
3
,
1
,
1024
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
// clang-format on
example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n4096_instance.cpp
View file @
dec32dc6
...
...
@@ -6,9 +6,13 @@
// clang-format off
// rm rn tm tn vn pd 2p
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
2
,
1
,
256
,
8
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
4
,
1
,
256
,
4
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
2
,
1
,
1024
,
2
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
4
,
1
,
1024
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
int8_t
,
1
,
2
,
1
,
256
,
8
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
int8_t
,
1
,
4
,
1
,
256
,
4
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
int8_t
,
1
,
2
,
1
,
1024
,
2
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
int8_t
,
1
,
4
,
1
,
1024
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
fp8_t
,
1
,
2
,
1
,
256
,
8
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
fp8_t
,
1
,
4
,
1
,
256
,
4
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
fp8_t
,
1
,
2
,
1
,
1024
,
2
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
fp8_t
,
1
,
4
,
1
,
1024
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
// clang-format on
example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n4096_tp_instance.cpp
View file @
dec32dc6
...
...
@@ -6,9 +6,13 @@
// clang-format off
// rm rn tm tn vn pd 2p
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
2
,
1
,
256
,
8
,
true
,
true
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
4
,
1
,
256
,
4
,
true
,
true
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
2
,
1
,
1024
,
2
,
true
,
true
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
4
,
1
,
1024
,
1
,
true
,
true
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
int8_t
,
1
,
2
,
1
,
256
,
8
,
true
,
true
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
int8_t
,
1
,
4
,
1
,
256
,
4
,
true
,
true
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
int8_t
,
1
,
2
,
1
,
1024
,
2
,
true
,
true
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
int8_t
,
1
,
4
,
1
,
1024
,
1
,
true
,
true
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
fp8_t
,
1
,
2
,
1
,
256
,
8
,
true
,
true
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
fp8_t
,
1
,
4
,
1
,
256
,
4
,
true
,
true
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
fp8_t
,
1
,
2
,
1
,
1024
,
2
,
true
,
true
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
fp8_t
,
1
,
4
,
1
,
1024
,
1
,
true
,
true
>
>
(
const
S
&
,
A
);
// clang-format on
example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n512_instance.cpp
View file @
dec32dc6
...
...
@@ -6,8 +6,13 @@
// clang-format off
// rm rn tm tn vn pd 2p
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
1
,
4
,
64
,
8
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
2
,
4
,
64
,
4
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
4
,
4
,
64
,
2
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
8
,
4
,
64
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
int8_t
,
1
,
1
,
4
,
64
,
8
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
int8_t
,
1
,
2
,
4
,
64
,
4
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
int8_t
,
1
,
4
,
4
,
64
,
2
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
int8_t
,
1
,
8
,
4
,
64
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
fp8_t
,
1
,
1
,
4
,
64
,
8
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
fp8_t
,
1
,
2
,
4
,
64
,
4
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
fp8_t
,
1
,
4
,
4
,
64
,
2
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
fp8_t
,
1
,
8
,
4
,
64
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
// clang-format on
example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n64_n128_instance.cpp
View file @
dec32dc6
...
...
@@ -6,7 +6,11 @@
// clang-format off
// rm rn tm tn vn pd 2p
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
1
,
4
,
64
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
1
,
4
,
64
,
2
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
2
,
4
,
64
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
int8_t
,
1
,
1
,
4
,
64
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
int8_t
,
1
,
1
,
4
,
64
,
2
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
int8_t
,
1
,
2
,
4
,
64
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
fp8_t
,
1
,
1
,
4
,
64
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
fp8_t
,
1
,
1
,
4
,
64
,
2
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
fp8_t
,
1
,
2
,
4
,
64
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
// clang-format on
example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n768_instance.cpp
View file @
dec32dc6
...
...
@@ -6,7 +6,11 @@
// clang-format off
// rm rn tm tn vn pd 2p
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
3
,
4
,
64
,
4
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
6
,
4
,
64
,
2
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
12
,
4
,
64
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
int8_t
,
1
,
3
,
4
,
64
,
4
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
int8_t
,
1
,
6
,
4
,
64
,
2
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
int8_t
,
1
,
12
,
4
,
64
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
fp8_t
,
1
,
3
,
4
,
64
,
4
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
fp8_t
,
1
,
6
,
4
,
64
,
2
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
moe_smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
ck_tile
::
fp8_t
,
1
,
12
,
4
,
64
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
// clang-format on
example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fwd_api.cpp
View file @
dec32dc6
...
...
@@ -4,7 +4,8 @@
#include <ck_tile/core.hpp>
#include "moe_smoothquant.hpp"
template
<
typename
DataType_
,
template
<
typename
InType
,
typename
OutType
,
ck_tile
::
index_t
Repeat_M_
,
// each thread repeat along M
ck_tile
::
index_t
Repeat_N_
,
// each thread repeat along N
ck_tile
::
index_t
ThreadPerBlock_M_
,
// num threads along M
...
...
@@ -12,7 +13,8 @@ template <typename DataType_,
ck_tile
::
index_t
Vector_N_
,
// vector size along N
bool
kPadN_
,
bool
kTwoPass_
>
using
trait_
=
moe_smoothquant_traits_
<
DataType_
,
using
trait_
=
moe_smoothquant_traits_
<
InType
,
OutType
,
Repeat_M_
,
Repeat_N_
,
ThreadPerBlock_M_
,
...
...
@@ -21,108 +23,108 @@ using trait_ = moe_smoothquant_traits_<DataType_,
kPadN_
,
kTwoPass_
>
;
template
<
typename
data
_type
>
template
<
typename
in_type
,
typename
out
_type
>
float
moe_smoothquant_dispatch
(
moe_smoothquant_traits
/*t*/
,
moe_smoothquant_args
a
,
const
ck_tile
::
stream_config
&
s
)
{
float
r
=
-
1
;
// clang-format off
// rm rn tm tn vn pd 2p
//
rm rn tm tn vn pd 2p
if
(
a
.
hidden_size
<=
64
)
{
r
=
moe_smoothquant_
<
trait_
<
data
_type
,
1
,
1
,
4
,
64
,
1
,
true
,
false
>>
(
s
,
a
);
r
=
moe_smoothquant_
<
trait_
<
in_type
,
out
_type
,
1
,
1
,
4
,
64
,
1
,
true
,
false
>>
(
s
,
a
);
}
else
if
(
a
.
hidden_size
<=
128
)
{
if
(
a
.
hidden_size
%
2
==
0
)
r
=
moe_smoothquant_
<
trait_
<
data
_type
,
1
,
1
,
4
,
64
,
2
,
true
,
false
>>
(
s
,
a
);
r
=
moe_smoothquant_
<
trait_
<
in_type
,
out
_type
,
1
,
1
,
4
,
64
,
2
,
true
,
false
>>
(
s
,
a
);
else
r
=
moe_smoothquant_
<
trait_
<
data
_type
,
1
,
2
,
4
,
64
,
1
,
true
,
false
>>
(
s
,
a
);
r
=
moe_smoothquant_
<
trait_
<
in_type
,
out
_type
,
1
,
2
,
4
,
64
,
1
,
true
,
false
>>
(
s
,
a
);
}
else
if
(
a
.
hidden_size
<=
256
)
{
if
(
a
.
hidden_size
%
4
==
0
)
r
=
moe_smoothquant_
<
trait_
<
data
_type
,
1
,
1
,
4
,
64
,
4
,
true
,
false
>>
(
s
,
a
);
r
=
moe_smoothquant_
<
trait_
<
in_type
,
out
_type
,
1
,
1
,
4
,
64
,
4
,
true
,
false
>>
(
s
,
a
);
else
if
(
a
.
hidden_size
%
2
==
0
)
r
=
moe_smoothquant_
<
trait_
<
data
_type
,
1
,
2
,
4
,
64
,
2
,
true
,
false
>>
(
s
,
a
);
r
=
moe_smoothquant_
<
trait_
<
in_type
,
out
_type
,
1
,
2
,
4
,
64
,
2
,
true
,
false
>>
(
s
,
a
);
else
r
=
moe_smoothquant_
<
trait_
<
data
_type
,
1
,
4
,
4
,
64
,
1
,
true
,
false
>>
(
s
,
a
);
r
=
moe_smoothquant_
<
trait_
<
in_type
,
out
_type
,
1
,
4
,
4
,
64
,
1
,
true
,
false
>>
(
s
,
a
);
}
else
if
(
a
.
hidden_size
<=
512
)
{
if
(
a
.
hidden_size
%
8
==
0
)
r
=
moe_smoothquant_
<
trait_
<
data
_type
,
1
,
1
,
4
,
64
,
8
,
true
,
false
>>
(
s
,
a
);
r
=
moe_smoothquant_
<
trait_
<
in_type
,
out
_type
,
1
,
1
,
4
,
64
,
8
,
true
,
false
>>
(
s
,
a
);
else
if
(
a
.
hidden_size
%
4
==
0
)
r
=
moe_smoothquant_
<
trait_
<
data
_type
,
1
,
2
,
4
,
64
,
4
,
true
,
false
>>
(
s
,
a
);
r
=
moe_smoothquant_
<
trait_
<
in_type
,
out
_type
,
1
,
2
,
4
,
64
,
4
,
true
,
false
>>
(
s
,
a
);
else
if
(
a
.
hidden_size
%
2
==
0
)
r
=
moe_smoothquant_
<
trait_
<
data
_type
,
1
,
4
,
4
,
64
,
2
,
true
,
false
>>
(
s
,
a
);
r
=
moe_smoothquant_
<
trait_
<
in_type
,
out
_type
,
1
,
4
,
4
,
64
,
2
,
true
,
false
>>
(
s
,
a
);
else
r
=
moe_smoothquant_
<
trait_
<
data
_type
,
1
,
8
,
4
,
64
,
1
,
true
,
false
>>
(
s
,
a
);
r
=
moe_smoothquant_
<
trait_
<
in_type
,
out
_type
,
1
,
8
,
4
,
64
,
1
,
true
,
false
>>
(
s
,
a
);
}
else
if
(
a
.
hidden_size
<=
768
)
{
if
(
a
.
hidden_size
%
4
==
0
)
r
=
moe_smoothquant_
<
trait_
<
data
_type
,
1
,
3
,
4
,
64
,
4
,
true
,
false
>>
(
s
,
a
);
r
=
moe_smoothquant_
<
trait_
<
in_type
,
out
_type
,
1
,
3
,
4
,
64
,
4
,
true
,
false
>>
(
s
,
a
);
else
if
(
a
.
hidden_size
%
2
==
0
)
r
=
moe_smoothquant_
<
trait_
<
data
_type
,
1
,
6
,
4
,
64
,
2
,
true
,
false
>>
(
s
,
a
);
r
=
moe_smoothquant_
<
trait_
<
in_type
,
out
_type
,
1
,
6
,
4
,
64
,
2
,
true
,
false
>>
(
s
,
a
);
else
r
=
moe_smoothquant_
<
trait_
<
data
_type
,
1
,
12
,
4
,
64
,
1
,
true
,
false
>>
(
s
,
a
);
r
=
moe_smoothquant_
<
trait_
<
in_type
,
out
_type
,
1
,
12
,
4
,
64
,
1
,
true
,
false
>>
(
s
,
a
);
}
else
if
(
a
.
hidden_size
<=
1024
)
{
if
(
a
.
hidden_size
%
8
==
0
)
r
=
moe_smoothquant_
<
trait_
<
data
_type
,
1
,
1
,
2
,
128
,
8
,
true
,
false
>>
(
s
,
a
);
r
=
moe_smoothquant_
<
trait_
<
in_type
,
out
_type
,
1
,
1
,
2
,
128
,
8
,
true
,
false
>>
(
s
,
a
);
else
if
(
a
.
hidden_size
%
4
==
0
)
r
=
moe_smoothquant_
<
trait_
<
data
_type
,
1
,
2
,
2
,
128
,
4
,
true
,
false
>>
(
s
,
a
);
r
=
moe_smoothquant_
<
trait_
<
in_type
,
out
_type
,
1
,
2
,
2
,
128
,
4
,
true
,
false
>>
(
s
,
a
);
else
if
(
a
.
hidden_size
%
2
==
0
)
r
=
moe_smoothquant_
<
trait_
<
data
_type
,
1
,
4
,
2
,
128
,
2
,
true
,
false
>>
(
s
,
a
);
r
=
moe_smoothquant_
<
trait_
<
in_type
,
out
_type
,
1
,
4
,
2
,
128
,
2
,
true
,
false
>>
(
s
,
a
);
else
r
=
moe_smoothquant_
<
trait_
<
data
_type
,
1
,
4
,
1
,
256
,
1
,
true
,
false
>>
(
s
,
a
);
r
=
moe_smoothquant_
<
trait_
<
in_type
,
out
_type
,
1
,
4
,
1
,
256
,
1
,
true
,
false
>>
(
s
,
a
);
}
else
if
(
a
.
hidden_size
<=
1536
)
{
if
(
a
.
hidden_size
%
8
==
0
)
r
=
moe_smoothquant_
<
trait_
<
data
_type
,
1
,
3
,
4
,
64
,
8
,
true
,
false
>>
(
s
,
a
);
r
=
moe_smoothquant_
<
trait_
<
in_type
,
out
_type
,
1
,
3
,
4
,
64
,
8
,
true
,
false
>>
(
s
,
a
);
else
if
(
a
.
hidden_size
%
4
==
0
)
r
=
moe_smoothquant_
<
trait_
<
data
_type
,
1
,
3
,
2
,
128
,
4
,
true
,
false
>>
(
s
,
a
);
r
=
moe_smoothquant_
<
trait_
<
in_type
,
out
_type
,
1
,
3
,
2
,
128
,
4
,
true
,
false
>>
(
s
,
a
);
else
if
(
a
.
hidden_size
%
2
==
0
)
r
=
moe_smoothquant_
<
trait_
<
data
_type
,
1
,
3
,
1
,
256
,
2
,
true
,
false
>>
(
s
,
a
);
r
=
moe_smoothquant_
<
trait_
<
in_type
,
out
_type
,
1
,
3
,
1
,
256
,
2
,
true
,
false
>>
(
s
,
a
);
else
r
=
moe_smoothquant_
<
trait_
<
data
_type
,
1
,
6
,
1
,
256
,
1
,
true
,
false
>>
(
s
,
a
);
r
=
moe_smoothquant_
<
trait_
<
in_type
,
out
_type
,
1
,
6
,
1
,
256
,
1
,
true
,
false
>>
(
s
,
a
);
}
else
if
(
a
.
hidden_size
<=
2048
)
{
if
(
a
.
hidden_size
%
8
==
0
)
r
=
moe_smoothquant_
<
trait_
<
data
_type
,
1
,
1
,
1
,
256
,
8
,
true
,
false
>>
(
s
,
a
);
r
=
moe_smoothquant_
<
trait_
<
in_type
,
out
_type
,
1
,
1
,
1
,
256
,
8
,
true
,
false
>>
(
s
,
a
);
else
if
(
a
.
hidden_size
%
4
==
0
)
r
=
moe_smoothquant_
<
trait_
<
data
_type
,
1
,
2
,
1
,
256
,
4
,
true
,
false
>>
(
s
,
a
);
r
=
moe_smoothquant_
<
trait_
<
in_type
,
out
_type
,
1
,
2
,
1
,
256
,
4
,
true
,
false
>>
(
s
,
a
);
else
if
(
a
.
hidden_size
%
2
==
0
)
r
=
moe_smoothquant_
<
trait_
<
data
_type
,
1
,
4
,
1
,
256
,
2
,
true
,
false
>>
(
s
,
a
);
r
=
moe_smoothquant_
<
trait_
<
in_type
,
out
_type
,
1
,
4
,
1
,
256
,
2
,
true
,
false
>>
(
s
,
a
);
else
r
=
moe_smoothquant_
<
trait_
<
data
_type
,
1
,
8
,
1
,
256
,
1
,
true
,
false
>>
(
s
,
a
);
r
=
moe_smoothquant_
<
trait_
<
in_type
,
out
_type
,
1
,
8
,
1
,
256
,
1
,
true
,
false
>>
(
s
,
a
);
}
else
if
(
a
.
hidden_size
<=
3072
)
{
if
(
a
.
hidden_size
%
8
==
0
)
r
=
moe_smoothquant_
<
trait_
<
data
_type
,
1
,
3
,
1
,
128
,
8
,
true
,
false
>>
(
s
,
a
);
r
=
moe_smoothquant_
<
trait_
<
in_type
,
out
_type
,
1
,
3
,
1
,
128
,
8
,
true
,
false
>>
(
s
,
a
);
else
if
(
a
.
hidden_size
%
4
==
0
)
r
=
moe_smoothquant_
<
trait_
<
data
_type
,
1
,
3
,
1
,
256
,
4
,
true
,
false
>>
(
s
,
a
);
r
=
moe_smoothquant_
<
trait_
<
in_type
,
out
_type
,
1
,
3
,
1
,
256
,
4
,
true
,
false
>>
(
s
,
a
);
else
if
(
a
.
hidden_size
%
2
==
0
)
r
=
moe_smoothquant_
<
trait_
<
data
_type
,
1
,
6
,
1
,
256
,
2
,
true
,
false
>>
(
s
,
a
);
r
=
moe_smoothquant_
<
trait_
<
in_type
,
out
_type
,
1
,
6
,
1
,
256
,
2
,
true
,
false
>>
(
s
,
a
);
else
r
=
moe_smoothquant_
<
trait_
<
data
_type
,
1
,
3
,
1
,
1024
,
1
,
true
,
false
>>
(
s
,
a
);
r
=
moe_smoothquant_
<
trait_
<
in_type
,
out
_type
,
1
,
3
,
1
,
1024
,
1
,
true
,
false
>>
(
s
,
a
);
}
else
if
(
a
.
hidden_size
<=
4096
)
{
if
(
a
.
hidden_size
%
8
==
0
)
r
=
moe_smoothquant_
<
trait_
<
data
_type
,
1
,
2
,
1
,
256
,
8
,
true
,
false
>>
(
s
,
a
);
r
=
moe_smoothquant_
<
trait_
<
in_type
,
out
_type
,
1
,
2
,
1
,
256
,
8
,
true
,
false
>>
(
s
,
a
);
else
if
(
a
.
hidden_size
%
4
==
0
)
r
=
moe_smoothquant_
<
trait_
<
data
_type
,
1
,
4
,
1
,
256
,
4
,
true
,
false
>>
(
s
,
a
);
r
=
moe_smoothquant_
<
trait_
<
in_type
,
out
_type
,
1
,
4
,
1
,
256
,
4
,
true
,
false
>>
(
s
,
a
);
else
if
(
a
.
hidden_size
%
2
==
0
)
r
=
moe_smoothquant_
<
trait_
<
data
_type
,
1
,
2
,
1
,
1024
,
2
,
true
,
false
>>
(
s
,
a
);
r
=
moe_smoothquant_
<
trait_
<
in_type
,
out
_type
,
1
,
2
,
1
,
1024
,
2
,
true
,
false
>>
(
s
,
a
);
else
r
=
moe_smoothquant_
<
trait_
<
data
_type
,
1
,
4
,
1
,
1024
,
1
,
true
,
false
>>
(
s
,
a
);
r
=
moe_smoothquant_
<
trait_
<
in_type
,
out
_type
,
1
,
4
,
1
,
1024
,
1
,
true
,
false
>>
(
s
,
a
);
}
else
if
(
a
.
hidden_size
>
4096
)
{
if
(
a
.
hidden_size
%
8
==
0
)
r
=
moe_smoothquant_
<
trait_
<
data
_type
,
1
,
2
,
1
,
256
,
8
,
true
,
true
>>
(
s
,
a
);
r
=
moe_smoothquant_
<
trait_
<
in_type
,
out
_type
,
1
,
2
,
1
,
256
,
8
,
true
,
true
>>
(
s
,
a
);
else
if
(
a
.
hidden_size
%
4
==
0
)
r
=
moe_smoothquant_
<
trait_
<
data
_type
,
1
,
4
,
1
,
256
,
4
,
true
,
true
>>
(
s
,
a
);
r
=
moe_smoothquant_
<
trait_
<
in_type
,
out
_type
,
1
,
4
,
1
,
256
,
4
,
true
,
true
>>
(
s
,
a
);
else
if
(
a
.
hidden_size
%
2
==
0
)
r
=
moe_smoothquant_
<
trait_
<
data
_type
,
1
,
2
,
1
,
1024
,
2
,
true
,
true
>>
(
s
,
a
);
r
=
moe_smoothquant_
<
trait_
<
in_type
,
out
_type
,
1
,
2
,
1
,
1024
,
2
,
true
,
true
>>
(
s
,
a
);
else
r
=
moe_smoothquant_
<
trait_
<
data
_type
,
1
,
4
,
1
,
1024
,
1
,
true
,
true
>>
(
s
,
a
);
r
=
moe_smoothquant_
<
trait_
<
in_type
,
out
_type
,
1
,
4
,
1
,
1024
,
1
,
true
,
true
>>
(
s
,
a
);
}
return
r
;
// clang-format on
...
...
@@ -132,13 +134,21 @@ float moe_smoothquant(moe_smoothquant_traits t,
moe_smoothquant_args
a
,
const
ck_tile
::
stream_config
&
s
)
{
if
(
t
.
data
_type
.
compare
(
"fp16"
)
==
0
)
if
(
t
.
in
_type
.
compare
(
"fp16"
)
==
0
&&
t
.
out_type
==
"int8"
)
{
return
moe_smoothquant_dispatch
<
ck_tile
::
fp16_t
>
(
t
,
a
,
s
);
return
moe_smoothquant_dispatch
<
ck_tile
::
fp16_t
,
ck_tile
::
int8_t
>
(
t
,
a
,
s
);
}
else
if
(
t
.
data
_type
.
compare
(
"
b
f16"
)
==
0
)
else
if
(
t
.
in
_type
.
compare
(
"f
p
16"
)
==
0
&&
t
.
out_type
==
"fp8"
)
{
return
moe_smoothquant_dispatch
<
ck_tile
::
bf16_t
>
(
t
,
a
,
s
);
return
moe_smoothquant_dispatch
<
ck_tile
::
fp16_t
,
ck_tile
::
fp8_t
>
(
t
,
a
,
s
);
}
else
if
(
t
.
in_type
.
compare
(
"bf16"
)
==
0
&&
t
.
out_type
==
"int8"
)
{
return
moe_smoothquant_dispatch
<
ck_tile
::
bf16_t
,
ck_tile
::
int8_t
>
(
t
,
a
,
s
);
}
else
if
(
t
.
in_type
.
compare
(
"bf16"
)
==
0
&&
t
.
out_type
==
"fp8"
)
{
return
moe_smoothquant_dispatch
<
ck_tile
::
bf16_t
,
ck_tile
::
fp8_t
>
(
t
,
a
,
s
);
}
else
throw
std
::
runtime_error
(
"Without supported instances!"
);
...
...
example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_instance_common.hpp
View file @
dec32dc6
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
5
, Advanced Micro Devices, Inc. All rights reserved.
#include <ck_tile/core.hpp>
#include "moe_smoothquant.hpp"
...
...
@@ -11,7 +11,8 @@
using
S
=
ck_tile
::
stream_config
;
using
A
=
moe_smoothquant_args
;
template
<
typename
DataType_
,
template
<
typename
InputType_
,
typename
OutputType_
,
ck_tile
::
index_t
Repeat_M_
,
// each thread repeat along M
ck_tile
::
index_t
Repeat_N_
,
// each thread repeat along N
ck_tile
::
index_t
ThreadPerBlock_M_
,
// num threads along M
...
...
@@ -19,7 +20,8 @@ template <typename DataType_,
ck_tile
::
index_t
Vector_N_
,
// vector size along N
bool
kPadN_
,
bool
kTwoPass_
>
using
trait_
=
moe_smoothquant_traits_
<
DataType_
,
using
trait_
=
moe_smoothquant_traits_
<
InputType_
,
OutputType_
,
Repeat_M_
,
Repeat_N_
,
ThreadPerBlock_M_
,
...
...
@@ -31,14 +33,15 @@ using trait_ = moe_smoothquant_traits_<DataType_,
template
<
typename
Traits_
>
float
moe_smoothquant_
(
const
S
&
s
,
A
a
)
{
using
DataType
=
typename
Traits_
::
DataType
;
using
InputType
=
typename
Traits_
::
InputType
;
using
OutputType
=
typename
Traits_
::
OutputType
;
using
PipelineProblem
=
ck_tile
::
SmoothquantPipelineProblem
<
typename
MoeSmoothquantTypeConfig
<
Data
Type
>::
XDataType
,
typename
MoeSmoothquantTypeConfig
<
Data
Type
>::
X
ScaleDataType
,
typename
MoeSmoothquantTypeConfig
<
Data
Type
>::
ComputeDataType
,
typename
MoeSmoothquantTypeConfig
<
Data
Type
>::
YScaleDataType
,
typename
MoeSmoothquantTypeConfig
<
Data
Type
>::
QYDataType
,
typename
MoeSmoothquantTypeConfig
<
InputType
,
Output
Type
>::
XDataType
,
typename
MoeSmoothquantTypeConfig
<
InputType
,
Output
Type
>::
Smooth
ScaleDataType
,
typename
MoeSmoothquantTypeConfig
<
InputType
,
Output
Type
>::
ComputeDataType
,
typename
MoeSmoothquantTypeConfig
<
InputType
,
Output
Type
>::
YScaleDataType
,
typename
MoeSmoothquantTypeConfig
<
InputType
,
Output
Type
>::
QYDataType
,
typename
Traits_
::
Shape
,
Traits_
::
kPadN
,
Traits_
::
kTwoPass
>
;
...
...
example/ck_tile/14_moe_smoothquant/moe_smoothquant.cpp
View file @
dec32dc6
...
...
@@ -63,7 +63,8 @@ auto create_args(int argc, char* argv[])
.
insert
(
"stride"
,
"-1"
,
"stride per row, if -1 then equal to hidden_size"
)
.
insert
(
"v"
,
"1"
,
"cpu validation or not"
)
.
insert
(
"kname"
,
"1"
,
"print kernel name or not"
)
.
insert
(
"prec"
,
"fp16"
,
"precision"
)
.
insert
(
"prec_i"
,
"fp16"
,
"input precision, fp16/bf16"
)
.
insert
(
"prec_o"
,
"int8"
,
"precision, int8/fp8"
)
.
insert
(
"warmup"
,
"5"
,
"cold iter"
)
.
insert
(
"repeat"
,
"20"
,
"hot iter"
);
...
...
@@ -71,7 +72,7 @@ auto create_args(int argc, char* argv[])
return
std
::
make_tuple
(
result
,
arg_parser
);
}
template
<
typename
Data
Type
>
template
<
typename
InputType
,
typename
Output
Type
>
bool
run
(
const
ck_tile
::
ArgParser
&
arg_parser
)
{
ck_tile
::
index_t
tokens
=
arg_parser
.
get_int
(
"t"
);
...
...
@@ -81,7 +82,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
stride
=
hidden_size
;
ck_tile
::
index_t
experts
=
arg_parser
.
get_int
(
"e"
);
ck_tile
::
index_t
topk
=
arg_parser
.
get_int
(
"k"
);
std
::
string
data_type
=
arg_parser
.
get_str
(
"prec"
);
std
::
string
prec_i
=
arg_parser
.
get_str
(
"prec_i"
);
std
::
string
prec_o
=
arg_parser
.
get_str
(
"prec_o"
);
int
kname
=
arg_parser
.
get_int
(
"kname"
);
int
do_validation
=
arg_parser
.
get_int
(
"v"
);
int
warmup
=
arg_parser
.
get_int
(
"warmup"
);
...
...
@@ -89,17 +91,17 @@ bool run(const ck_tile::ArgParser& arg_parser)
assert
(
stride
>=
hidden_size
);
using
TypeConfig
=
MoeSmoothquantTypeConfig
<
Data
Type
>
;
using
TypeConfig
=
MoeSmoothquantTypeConfig
<
InputType
,
Output
Type
>
;
using
XDataType
=
typename
TypeConfig
::
XDataType
;
using
X
ScaleDataType
=
typename
TypeConfig
::
X
ScaleDataType
;
using
YScaleDataType
=
typename
TypeConfig
::
YScaleDataType
;
using
QYDataType
=
typename
TypeConfig
::
QYDataType
;
using
ComputeDataType
=
typename
TypeConfig
::
ComputeDataType
;
using
XDataType
=
typename
TypeConfig
::
XDataType
;
using
Smooth
ScaleDataType
=
typename
TypeConfig
::
Smooth
ScaleDataType
;
using
YScaleDataType
=
typename
TypeConfig
::
YScaleDataType
;
using
QYDataType
=
typename
TypeConfig
::
QYDataType
;
using
ComputeDataType
=
typename
TypeConfig
::
ComputeDataType
;
// host verify
ck_tile
::
HostTensor
<
XDataType
>
x_host
({
tokens
,
hidden_size
},
{
stride
,
1
});
ck_tile
::
HostTensor
<
X
ScaleDataType
>
x
scale_host
({
experts
*
hidden_size
});
ck_tile
::
HostTensor
<
Smooth
ScaleDataType
>
sm
scale_host
({
experts
*
hidden_size
});
ck_tile
::
HostTensor
<
ck_tile
::
index_t
>
topk_ids_host
({
tokens
,
topk
});
ck_tile
::
HostTensor
<
YScaleDataType
>
yscale_host_ref
({
topk
*
tokens
},
{
1
});
...
...
@@ -110,26 +112,26 @@ bool run(const ck_tile::ArgParser& arg_parser)
topid_unique_gen
<
ck_tile
::
index_t
>
(
topk_ids_host
.
mData
,
tokens
,
topk
,
experts
,
11937
);
ck_tile
::
FillUniformDistribution
<
XDataType
>
{
-
.5
f
,
.5
f
}(
x_host
);
ck_tile
::
FillUniformDistribution
<
X
ScaleDataType
>
{
1e-3
,
.5
f
}(
x
scale_host
);
ck_tile
::
FillUniformDistribution
<
Smooth
ScaleDataType
>
{
1e-3
,
.5
f
}(
sm
scale_host
);
ck_tile
::
DeviceMem
x_buf
(
x_host
.
get_element_space_size_in_bytes
());
ck_tile
::
DeviceMem
x
scale_buf
(
x
scale_host
.
get_element_space_size_in_bytes
());
ck_tile
::
DeviceMem
sm
scale_buf
(
sm
scale_host
.
get_element_space_size_in_bytes
());
ck_tile
::
DeviceMem
topk_ids_buf
(
topk_ids_host
.
get_element_space_size_in_bytes
());
ck_tile
::
DeviceMem
yscale_buf
(
yscale_host_dev
.
get_element_space_size_in_bytes
());
ck_tile
::
DeviceMem
qy_buf
(
qy_host_dev
.
get_element_space_size_in_bytes
());
x_buf
.
ToDevice
(
x_host
.
data
());
x
scale_buf
.
ToDevice
(
x
scale_host
.
data
());
sm
scale_buf
.
ToDevice
(
sm
scale_host
.
data
());
topk_ids_buf
.
ToDevice
(
topk_ids_host
.
data
());
std
::
cout
<<
"["
<<
data_type
<<
"]"
std
::
cout
<<
"["
<<
prec_i
<<
"-"
<<
prec_o
<<
"]"
<<
" tokens:"
<<
tokens
<<
", hidden_size:"
<<
hidden_size
<<
", stride:"
<<
stride
<<
", experts:"
<<
experts
<<
", topk:"
<<
topk
<<
std
::
flush
;
moe_smoothquant_traits
traits
{
data_type
};
moe_smoothquant_traits
traits
{
prec_i
,
prec_o
};
moe_smoothquant_args
args
{
x_buf
.
GetDeviceBuffer
(),
x
scale_buf
.
GetDeviceBuffer
(),
sm
scale_buf
.
GetDeviceBuffer
(),
topk_ids_buf
.
GetDeviceBuffer
(),
yscale_buf
.
GetDeviceBuffer
(),
qy_buf
.
GetDeviceBuffer
(),
...
...
@@ -143,9 +145,10 @@ bool run(const ck_tile::ArgParser& arg_parser)
float
ave_time
=
moe_smoothquant
(
traits
,
args
,
ck_tile
::
stream_config
{
nullptr
,
true
,
kname
?
1
:
0
,
warmup
,
repeat
});
std
::
size_t
num_byte
=
sizeof
(
XDataType
)
*
tokens
*
hidden_size
+
sizeof
(
XScaleDataType
)
*
topk
*
hidden_size
+
sizeof
(
YScaleDataType
)
*
topk
*
tokens
+
sizeof
(
QYDataType
)
*
topk
*
tokens
*
hidden_size
;
std
::
size_t
num_byte
=
sizeof
(
XDataType
)
*
tokens
*
hidden_size
+
sizeof
(
SmoothScaleDataType
)
*
topk
*
hidden_size
+
sizeof
(
YScaleDataType
)
*
topk
*
tokens
+
sizeof
(
QYDataType
)
*
topk
*
tokens
*
hidden_size
;
float
gb_per_sec
=
num_byte
/
1.E6
/
ave_time
;
std
::
cout
<<
", "
<<
ave_time
*
1.E3
<<
" us, "
<<
gb_per_sec
<<
" GB/s"
<<
std
::
flush
;
...
...
@@ -165,11 +168,11 @@ bool run(const ck_tile::ArgParser& arg_parser)
for
(
int
i_h
=
0
;
i_h
<
hidden_size
;
++
i_h
)
{
auto
v_
x
scale
=
ck_tile
::
type_convert
<
ComputeDataType
>
(
x
scale_host
(
i_expert
*
hidden_size
+
i_h
));
auto
v_
sm
scale
=
ck_tile
::
type_convert
<
ComputeDataType
>
(
sm
scale_host
(
i_expert
*
hidden_size
+
i_h
));
auto
v_x
=
ck_tile
::
type_convert
<
ComputeDataType
>
(
x_host
(
i_token
,
i_h
));
// y_host(i_token * topk + i_topk, i_h) = v_x * v_
x
scale;
y_host
(
i_topk
*
tokens
+
i_token
,
i_h
)
=
v_x
*
v_
x
scale
;
// y_host(i_token * topk + i_topk, i_h) = v_x * v_
sm
scale;
y_host
(
i_topk
*
tokens
+
i_token
,
i_h
)
=
v_x
*
v_
sm
scale
;
}
}
};
...
...
@@ -250,14 +253,23 @@ int main(int argc, char* argv[])
if
(
!
result
)
return
-
1
;
const
std
::
string
data_type
=
arg_parser
.
get_str
(
"prec"
);
if
(
data_type
==
"fp16"
)
const
std
::
string
prec_i
=
arg_parser
.
get_str
(
"prec_i"
);
const
std
::
string
prec_o
=
arg_parser
.
get_str
(
"prec_o"
);
if
(
prec_i
==
"fp16"
&&
prec_o
==
"int8"
)
{
return
run
<
ck_tile
::
half_t
,
ck_tile
::
int8_t
>
(
arg_parser
)
?
0
:
-
2
;
}
else
if
(
prec_i
==
"fp16"
&&
prec_o
==
"fp8"
)
{
return
run
<
ck_tile
::
half_t
,
ck_tile
::
fp8_t
>
(
arg_parser
)
?
0
:
-
2
;
}
else
if
(
prec_i
==
"bf16"
&&
prec_o
==
"int8"
)
{
return
run
<
ck_tile
::
half
_t
>
(
arg_parser
)
?
0
:
-
2
;
return
run
<
ck_tile
::
bf16_t
,
ck_tile
::
int8
_t
>
(
arg_parser
)
?
0
:
-
2
;
}
else
if
(
data_type
==
"bf16
"
)
else
if
(
prec_i
==
"bf16"
&&
prec_o
==
"fp8
"
)
{
return
run
<
ck_tile
::
bf16_t
>
(
arg_parser
)
?
0
:
-
2
;
return
run
<
ck_tile
::
bf16_t
,
ck_tile
::
fp8_t
>
(
arg_parser
)
?
0
:
-
2
;
}
return
-
3
;
...
...
example/ck_tile/14_moe_smoothquant/moe_smoothquant.hpp
View file @
dec32dc6
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
5
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
...
...
@@ -8,27 +8,14 @@
#include "ck_tile/ops/smoothquant.hpp"
#include <string>
template
<
typename
DataType
>
struct
MoeSmoothquantTypeConfig
;
template
<
>
struct
MoeSmoothquantTypeConfig
<
ck_tile
::
half_t
>
{
using
XDataType
=
ck_tile
::
half_t
;
using
XScaleDataType
=
float
;
using
YScaleDataType
=
float
;
using
QYDataType
=
ck_tile
::
int8_t
;
using
ComputeDataType
=
float
;
};
template
<
>
struct
MoeSmoothquantTypeConfig
<
ck_tile
::
bf16_t
>
template
<
typename
InputType
,
typename
OutputType
>
struct
MoeSmoothquantTypeConfig
{
using
XDataType
=
ck_tile
::
bf16_t
;
using
X
ScaleDataType
=
float
;
using
YScaleDataType
=
float
;
using
QYDataType
=
ck_tile
::
int8_t
;
using
ComputeDataType
=
float
;
using
XDataType
=
InputType
;
using
Smooth
ScaleDataType
=
float
;
using
YScaleDataType
=
float
;
using
QYDataType
=
OutputType
;
using
ComputeDataType
=
float
;
};
// runtime args
...
...
@@ -37,7 +24,8 @@ struct moe_smoothquant_args : public ck_tile::MoeSmoothquantHostArgs
};
// this is used to pattern-match internal kernel implementation, not to instantiate kernel
template
<
typename
DataType_
,
template
<
typename
InputType_
,
typename
OutputType_
,
ck_tile
::
index_t
Repeat_M_
,
// each thread repeat along M
ck_tile
::
index_t
Repeat_N_
,
// each thread repeat along N
ck_tile
::
index_t
ThreadPerBlock_M_
,
// num threads along M
...
...
@@ -47,7 +35,8 @@ template <typename DataType_,
bool
kTwoPass_
>
struct
moe_smoothquant_traits_
{
using
DataType
=
ck_tile
::
remove_cvref_t
<
DataType_
>
;
using
InputType
=
ck_tile
::
remove_cvref_t
<
InputType_
>
;
using
OutputType
=
ck_tile
::
remove_cvref_t
<
OutputType_
>
;
static
constexpr
bool
is_warp_per_row
=
ThreadPerBlock_N_
<=
warpSize
;
static_assert
((
ThreadPerBlock_M_
*
ThreadPerBlock_N_
)
%
warpSize
==
0
);
...
...
@@ -108,7 +97,8 @@ float moe_smoothquant_(const ck_tile::stream_config& s, moe_smoothquant_args a);
// This is the public API, will be generated by script
struct
moe_smoothquant_traits
{
std
::
string
data_type
;
std
::
string
in_type
;
// input type
std
::
string
out_type
;
// output type
};
float
moe_smoothquant
(
moe_smoothquant_traits
,
moe_smoothquant_args
,
const
ck_tile
::
stream_config
&
);
example/ck_tile/14_moe_smoothquant/script/smoke_test.sh
View file @
dec32dc6
...
...
@@ -2,29 +2,31 @@
EXE
=
build/bin/tile_example_moe_smoothquant
for
pr_i
in
"fp16"
"bf16"
;
do
$EXE
-prec
=
$pr_i
-t
=
99
-h
=
13
$EXE
-prec
=
$pr_i
-t
=
17
-h
=
16
$EXE
-prec
=
$pr_i
-t
=
1
-h
=
100
$EXE
-prec
=
$pr_i
-t
=
4
-h
=
128
$EXE
-prec
=
$pr_i
-t
=
80
-h
=
127
$EXE
-prec
=
$pr_i
-t
=
22
-h
=
255
-stride
=
256
$EXE
-prec
=
$pr_i
-t
=
7
-h
=
599
$EXE
-prec
=
$pr_i
-t
=
19
-h
=
512
$EXE
-prec
=
$pr_i
-t
=
33
-h
=
313
-stride
=
1000
$EXE
-prec
=
$pr_i
-t
=
11
-h
=
510
$EXE
-prec
=
$pr_i
-t
=
171
-h
=
676
-stride
=
818
$EXE
-prec
=
$pr_i
-t
=
91
-h
=
636
$EXE
-prec
=
$pr_i
-t
=
12
-h
=
768
-stride
=
800
$EXE
-prec
=
$pr_i
-t
=
100
-h
=
766
-stride
=
812
$EXE
-prec
=
$pr_i
-t
=
31
-h
=
1024
$EXE
-prec
=
$pr_i
-t
=
64
-h
=
1000
-stride
=
1004
$EXE
-prec
=
$pr_i
-t
=
8
-h
=
1501
$EXE
-prec
=
$pr_i
-t
=
3
-h
=
1826
$EXE
-prec
=
$pr_i
-t
=
5
-h
=
2040
$EXE
-prec
=
$pr_i
-t
=
7
-h
=
2734
$EXE
-prec
=
$pr_i
-t
=
1
-h
=
3182
$EXE
-prec
=
$pr_i
-t
=
9
-h
=
4096
$EXE
-prec
=
$pr_i
-t
=
3
-h
=
8192
$EXE
-prec
=
$pr_i
-t
=
1
-h
=
10547
$EXE
-prec
=
$pr_i
-t
=
3
-h
=
17134
for
pr_o
in
"int8"
"fp8"
;
do
$EXE
-prec_i
=
$pr_i
-prec_o
=
$pr_o
-t
=
99
-h
=
13
$EXE
-prec_i
=
$pr_i
-prec_o
=
$pr_o
-t
=
17
-h
=
16
$EXE
-prec_i
=
$pr_i
-prec_o
=
$pr_o
-t
=
1
-h
=
100
$EXE
-prec_i
=
$pr_i
-prec_o
=
$pr_o
-t
=
4
-h
=
128
$EXE
-prec_i
=
$pr_i
-prec_o
=
$pr_o
-t
=
80
-h
=
127
$EXE
-prec_i
=
$pr_i
-prec_o
=
$pr_o
-t
=
22
-h
=
255
-stride
=
256
$EXE
-prec_i
=
$pr_i
-prec_o
=
$pr_o
-t
=
7
-h
=
599
$EXE
-prec_i
=
$pr_i
-prec_o
=
$pr_o
-t
=
19
-h
=
512
$EXE
-prec_i
=
$pr_i
-prec_o
=
$pr_o
-t
=
33
-h
=
313
-stride
=
1000
$EXE
-prec_i
=
$pr_i
-prec_o
=
$pr_o
-t
=
11
-h
=
510
$EXE
-prec_i
=
$pr_i
-prec_o
=
$pr_o
-t
=
171
-h
=
676
-stride
=
818
$EXE
-prec_i
=
$pr_i
-prec_o
=
$pr_o
-t
=
91
-h
=
636
$EXE
-prec_i
=
$pr_i
-prec_o
=
$pr_o
-t
=
12
-h
=
768
-stride
=
800
$EXE
-prec_i
=
$pr_i
-prec_o
=
$pr_o
-t
=
100
-h
=
766
-stride
=
812
$EXE
-prec_i
=
$pr_i
-prec_o
=
$pr_o
-t
=
31
-h
=
1024
$EXE
-prec_i
=
$pr_i
-prec_o
=
$pr_o
-t
=
64
-h
=
1000
-stride
=
1004
$EXE
-prec_i
=
$pr_i
-prec_o
=
$pr_o
-t
=
8
-h
=
1501
$EXE
-prec_i
=
$pr_i
-prec_o
=
$pr_o
-t
=
3
-h
=
1826
$EXE
-prec_i
=
$pr_i
-prec_o
=
$pr_o
-t
=
5
-h
=
2040
$EXE
-prec_i
=
$pr_i
-prec_o
=
$pr_o
-t
=
7
-h
=
2734
$EXE
-prec_i
=
$pr_i
-prec_o
=
$pr_o
-t
=
1
-h
=
3182
$EXE
-prec_i
=
$pr_i
-prec_o
=
$pr_o
-t
=
9
-h
=
4096
$EXE
-prec_i
=
$pr_i
-prec_o
=
$pr_o
-t
=
3
-h
=
8192
$EXE
-prec_i
=
$pr_i
-prec_o
=
$pr_o
-t
=
1
-h
=
10547
$EXE
-prec_i
=
$pr_i
-prec_o
=
$pr_o
-t
=
3
-h
=
17134
done
done
Prev
1
2
3
4
5
6
7
8
…
11
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment