Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
ead5167a
Commit
ead5167a
authored
Nov 01, 2024
by
dummycoderfe
Browse files
merge develop
parents
da1a2829
03c6448b
Changes
137
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
856 additions
and
19 deletions
+856
-19
example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n4096_tp_instance.cpp
...othquant/instances/smoothquant_fp16_n4096_tp_instance.cpp
+14
-0
example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n512_instance.cpp
..._smoothquant/instances/smoothquant_fp16_n512_instance.cpp
+13
-0
example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n64_n128_instance.cpp
...othquant/instances/smoothquant_fp16_n64_n128_instance.cpp
+12
-0
example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n768_instance.cpp
..._smoothquant/instances/smoothquant_fp16_n768_instance.cpp
+12
-0
example/ck_tile/12_smoothquant/instances/smoothquant_fwd_api.cpp
.../ck_tile/12_smoothquant/instances/smoothquant_fwd_api.cpp
+143
-0
example/ck_tile/12_smoothquant/instances/smoothquant_instance_common.hpp
.../12_smoothquant/instances/smoothquant_instance_common.hpp
+62
-0
example/ck_tile/12_smoothquant/script/perf_test.sh
example/ck_tile/12_smoothquant/script/perf_test.sh
+37
-0
example/ck_tile/12_smoothquant/script/smoke_test.sh
example/ck_tile/12_smoothquant/script/smoke_test.sh
+30
-0
example/ck_tile/12_smoothquant/smoothquant.cpp
example/ck_tile/12_smoothquant/smoothquant.cpp
+218
-0
example/ck_tile/12_smoothquant/smoothquant.hpp
example/ck_tile/12_smoothquant/smoothquant.hpp
+114
-0
example/ck_tile/13_moe_sorting/CMakeLists.txt
example/ck_tile/13_moe_sorting/CMakeLists.txt
+0
-0
example/ck_tile/13_moe_sorting/README.md
example/ck_tile/13_moe_sorting/README.md
+0
-0
example/ck_tile/13_moe_sorting/moe_sorting.cpp
example/ck_tile/13_moe_sorting/moe_sorting.cpp
+0
-0
example/ck_tile/13_moe_sorting/moe_sorting_api.cpp
example/ck_tile/13_moe_sorting/moe_sorting_api.cpp
+0
-0
example/ck_tile/13_moe_sorting/moe_sorting_api.hpp
example/ck_tile/13_moe_sorting/moe_sorting_api.hpp
+0
-0
example/ck_tile/13_moe_sorting/script/smoke_test.sh
example/ck_tile/13_moe_sorting/script/smoke_test.sh
+0
-0
example/ck_tile/CMakeLists.txt
example/ck_tile/CMakeLists.txt
+2
-1
include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
...or_operation/gpu/element/unary_element_wise_operation.hpp
+94
-18
include/ck_tile/core.hpp
include/ck_tile/core.hpp
+1
-0
include/ck_tile/core/numeric/int8.hpp
include/ck_tile/core/numeric/int8.hpp
+104
-0
No files found.
example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n4096_tp_instance.cpp
0 → 100644
View file @
ead5167a
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include "smoothquant_instance_common.hpp"
// clang-format off
// rm rn tm tn vn pd 2p
template
float
smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
2
,
1
,
256
,
8
,
true
,
true
>
>
(
const
S
&
,
A
);
template
float
smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
4
,
1
,
256
,
4
,
true
,
true
>
>
(
const
S
&
,
A
);
template
float
smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
2
,
1
,
1024
,
2
,
true
,
true
>
>
(
const
S
&
,
A
);
template
float
smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
4
,
1
,
1024
,
1
,
true
,
true
>
>
(
const
S
&
,
A
);
// clang-format on
example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n512_instance.cpp
0 → 100644
View file @
ead5167a
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include "smoothquant_instance_common.hpp"
// clang-format off
// rm rn tm tn vn pd 2p
template
float
smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
1
,
4
,
64
,
8
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
2
,
4
,
64
,
4
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
4
,
4
,
64
,
2
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
8
,
4
,
64
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
// clang-format on
example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n64_n128_instance.cpp
0 → 100644
View file @
ead5167a
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include "smoothquant_instance_common.hpp"
// clang-format off
// rm rn tm tn vn pd 2p
template
float
smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
1
,
4
,
64
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
1
,
4
,
64
,
2
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
2
,
4
,
64
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
// clang-format on
example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n768_instance.cpp
0 → 100644
View file @
ead5167a
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include "smoothquant_instance_common.hpp"
// clang-format off
// rm rn tm tn vn pd 2p
template
float
smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
3
,
4
,
64
,
4
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
6
,
4
,
64
,
2
,
true
,
false
>
>
(
const
S
&
,
A
);
template
float
smoothquant_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
12
,
4
,
64
,
1
,
true
,
false
>
>
(
const
S
&
,
A
);
// clang-format on
example/ck_tile/
0
2_
layernorm2d/instances/layernorm2d
_fwd_api.cpp
→
example/ck_tile/
1
2_
smoothquant/instances/smoothquant
_fwd_api.cpp
View file @
ead5167a
...
...
@@ -2,7 +2,7 @@
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include <ck_tile/core.hpp>
#include "
layernorm2d_fwd
.hpp"
#include "
smoothquant
.hpp"
template
<
typename
DataType_
,
ck_tile
::
index_t
Repeat_M_
,
// each thread repeat along M
...
...
@@ -11,145 +11,133 @@ template <typename DataType_,
ck_tile
::
index_t
ThreadPerBlock_N_
,
// num threads along N
ck_tile
::
index_t
Vector_N_
,
// vector size along N
bool
kPadN_
,
bool
kSaveMeanInvStd_
,
bool
kTwoPass_
>
using
trait_
=
layernorm2d_fwd_traits_
<
DataType_
,
Repeat_M_
,
Repeat_N_
,
ThreadPerBlock_M_
,
ThreadPerBlock_N_
,
Vector_N_
,
kPadN_
,
kSaveMeanInvStd_
,
kTwoPass_
>
;
using
trait_
=
smoothquant_traits_
<
DataType_
,
Repeat_M_
,
Repeat_N_
,
ThreadPerBlock_M_
,
ThreadPerBlock_N_
,
Vector_N_
,
kPadN_
,
kTwoPass_
>
;
template
<
typename
data_type
>
float
layernorm2d_fwd_b16_
(
layernorm2d_fwd
_traits
/*t*/
,
layernorm2d_fwd
_args
a
,
float
smoothquant_dispatch
(
smoothquant
_traits
/*t*/
,
smoothquant
_args
a
,
const
ck_tile
::
stream_config
&
s
)
{
#if 1
float
r
=
-
1
;
// clang-format off
//
rm rn tm
tn vn pd
mv
2p
// rm rn tm tn vn
pd 2p
if
(
a
.
n
<=
64
)
{
r
=
layernorm2d_fwd
_
<
trait_
<
data_type
,
1
,
1
,
4
,
64
,
1
,
true
,
false
,
false
>>
(
s
,
a
);
r
=
smoothquant
_
<
trait_
<
data_type
,
1
,
1
,
4
,
64
,
1
,
true
,
false
>>
(
s
,
a
);
}
else
if
(
a
.
n
<=
128
)
{
if
(
a
.
n
%
2
==
0
)
r
=
layernorm2d_fwd
_
<
trait_
<
data_type
,
1
,
1
,
4
,
64
,
2
,
true
,
false
,
false
>>
(
s
,
a
);
r
=
smoothquant
_
<
trait_
<
data_type
,
1
,
1
,
4
,
64
,
2
,
true
,
false
>>
(
s
,
a
);
else
r
=
layernorm2d_fwd
_
<
trait_
<
data_type
,
1
,
2
,
4
,
64
,
1
,
true
,
false
,
false
>>
(
s
,
a
);
r
=
smoothquant
_
<
trait_
<
data_type
,
1
,
2
,
4
,
64
,
1
,
true
,
false
>>
(
s
,
a
);
}
else
if
(
a
.
n
<=
256
)
{
if
(
a
.
n
%
4
==
0
)
r
=
layernorm2d_fwd
_
<
trait_
<
data_type
,
1
,
1
,
4
,
64
,
4
,
true
,
false
,
false
>>
(
s
,
a
);
r
=
smoothquant
_
<
trait_
<
data_type
,
1
,
1
,
4
,
64
,
4
,
true
,
false
>>
(
s
,
a
);
else
if
(
a
.
n
%
2
==
0
)
r
=
layernorm2d_fwd
_
<
trait_
<
data_type
,
1
,
2
,
4
,
64
,
2
,
true
,
false
,
false
>>
(
s
,
a
);
r
=
smoothquant
_
<
trait_
<
data_type
,
1
,
2
,
4
,
64
,
2
,
true
,
false
>>
(
s
,
a
);
else
r
=
layernorm2d_fwd
_
<
trait_
<
data_type
,
1
,
4
,
4
,
64
,
1
,
true
,
false
,
false
>>
(
s
,
a
);
r
=
smoothquant
_
<
trait_
<
data_type
,
1
,
4
,
4
,
64
,
1
,
true
,
false
>>
(
s
,
a
);
}
else
if
(
a
.
n
<=
512
)
{
if
(
a
.
n
%
8
==
0
)
r
=
layernorm2d_fwd
_
<
trait_
<
data_type
,
1
,
1
,
4
,
64
,
8
,
true
,
false
,
false
>>
(
s
,
a
);
r
=
smoothquant
_
<
trait_
<
data_type
,
1
,
1
,
4
,
64
,
8
,
true
,
false
>>
(
s
,
a
);
else
if
(
a
.
n
%
4
==
0
)
r
=
layernorm2d_fwd
_
<
trait_
<
data_type
,
1
,
2
,
4
,
64
,
4
,
true
,
false
,
false
>>
(
s
,
a
);
r
=
smoothquant
_
<
trait_
<
data_type
,
1
,
2
,
4
,
64
,
4
,
true
,
false
>>
(
s
,
a
);
else
if
(
a
.
n
%
2
==
0
)
r
=
layernorm2d_fwd
_
<
trait_
<
data_type
,
1
,
4
,
4
,
64
,
2
,
true
,
false
,
false
>>
(
s
,
a
);
r
=
smoothquant
_
<
trait_
<
data_type
,
1
,
4
,
4
,
64
,
2
,
true
,
false
>>
(
s
,
a
);
else
r
=
layernorm2d_fwd
_
<
trait_
<
data_type
,
1
,
8
,
4
,
64
,
1
,
true
,
false
,
false
>>
(
s
,
a
);
r
=
smoothquant
_
<
trait_
<
data_type
,
1
,
8
,
4
,
64
,
1
,
true
,
false
>>
(
s
,
a
);
}
else
if
(
a
.
n
<=
768
)
{
if
(
a
.
n
%
4
==
0
)
r
=
layernorm2d_fwd
_
<
trait_
<
data_type
,
1
,
3
,
4
,
64
,
4
,
true
,
false
,
false
>>
(
s
,
a
);
r
=
smoothquant
_
<
trait_
<
data_type
,
1
,
3
,
4
,
64
,
4
,
true
,
false
>>
(
s
,
a
);
else
if
(
a
.
n
%
2
==
0
)
r
=
layernorm2d_fwd
_
<
trait_
<
data_type
,
1
,
6
,
4
,
64
,
2
,
true
,
false
,
false
>>
(
s
,
a
);
r
=
smoothquant
_
<
trait_
<
data_type
,
1
,
6
,
4
,
64
,
2
,
true
,
false
>>
(
s
,
a
);
else
r
=
layernorm2d_fwd
_
<
trait_
<
data_type
,
1
,
12
,
4
,
64
,
1
,
true
,
false
,
false
>>
(
s
,
a
);
r
=
smoothquant
_
<
trait_
<
data_type
,
1
,
12
,
4
,
64
,
1
,
true
,
false
>>
(
s
,
a
);
}
else
if
(
a
.
n
<=
1024
)
{
if
(
a
.
n
%
8
==
0
)
r
=
layernorm2d_fwd
_
<
trait_
<
data_type
,
1
,
1
,
2
,
128
,
8
,
true
,
false
,
false
>>
(
s
,
a
);
r
=
smoothquant
_
<
trait_
<
data_type
,
1
,
1
,
2
,
128
,
8
,
true
,
false
>>
(
s
,
a
);
else
if
(
a
.
n
%
4
==
0
)
r
=
layernorm2d_fwd
_
<
trait_
<
data_type
,
1
,
2
,
2
,
128
,
4
,
true
,
false
,
false
>>
(
s
,
a
);
r
=
smoothquant
_
<
trait_
<
data_type
,
1
,
2
,
2
,
128
,
4
,
true
,
false
>>
(
s
,
a
);
else
if
(
a
.
n
%
2
==
0
)
r
=
layernorm2d_fwd
_
<
trait_
<
data_type
,
1
,
4
,
2
,
128
,
2
,
true
,
false
,
false
>>
(
s
,
a
);
r
=
smoothquant
_
<
trait_
<
data_type
,
1
,
4
,
2
,
128
,
2
,
true
,
false
>>
(
s
,
a
);
else
r
=
layernorm2d_fwd
_
<
trait_
<
data_type
,
1
,
4
,
1
,
256
,
1
,
true
,
false
,
false
>>
(
s
,
a
);
r
=
smoothquant
_
<
trait_
<
data_type
,
1
,
4
,
1
,
256
,
1
,
true
,
false
>>
(
s
,
a
);
}
else
if
(
a
.
n
<=
1536
)
{
if
(
a
.
n
%
8
==
0
)
r
=
layernorm2d_fwd
_
<
trait_
<
data_type
,
1
,
3
,
4
,
64
,
8
,
true
,
false
,
false
>>
(
s
,
a
);
r
=
smoothquant
_
<
trait_
<
data_type
,
1
,
3
,
4
,
64
,
8
,
true
,
false
>>
(
s
,
a
);
else
if
(
a
.
n
%
4
==
0
)
r
=
layernorm2d_fwd
_
<
trait_
<
data_type
,
1
,
3
,
2
,
128
,
4
,
true
,
false
,
false
>>
(
s
,
a
);
r
=
smoothquant
_
<
trait_
<
data_type
,
1
,
3
,
2
,
128
,
4
,
true
,
false
>>
(
s
,
a
);
else
if
(
a
.
n
%
2
==
0
)
r
=
layernorm2d_fwd
_
<
trait_
<
data_type
,
1
,
3
,
1
,
256
,
2
,
true
,
false
,
false
>>
(
s
,
a
);
r
=
smoothquant
_
<
trait_
<
data_type
,
1
,
3
,
1
,
256
,
2
,
true
,
false
>>
(
s
,
a
);
else
r
=
layernorm2d_fwd
_
<
trait_
<
data_type
,
1
,
6
,
1
,
256
,
1
,
true
,
false
,
false
>>
(
s
,
a
);
r
=
smoothquant
_
<
trait_
<
data_type
,
1
,
6
,
1
,
256
,
1
,
true
,
false
>>
(
s
,
a
);
}
else
if
(
a
.
n
<=
2048
)
{
if
(
a
.
n
%
8
==
0
)
r
=
layernorm2d_fwd
_
<
trait_
<
data_type
,
1
,
1
,
1
,
256
,
8
,
true
,
false
,
false
>>
(
s
,
a
);
r
=
smoothquant
_
<
trait_
<
data_type
,
1
,
1
,
1
,
256
,
8
,
true
,
false
>>
(
s
,
a
);
else
if
(
a
.
n
%
4
==
0
)
r
=
layernorm2d_fwd
_
<
trait_
<
data_type
,
1
,
2
,
1
,
256
,
4
,
true
,
false
,
false
>>
(
s
,
a
);
r
=
smoothquant
_
<
trait_
<
data_type
,
1
,
2
,
1
,
256
,
4
,
true
,
false
>>
(
s
,
a
);
else
if
(
a
.
n
%
2
==
0
)
r
=
layernorm2d_fwd
_
<
trait_
<
data_type
,
1
,
4
,
1
,
256
,
2
,
true
,
false
,
false
>>
(
s
,
a
);
r
=
smoothquant
_
<
trait_
<
data_type
,
1
,
4
,
1
,
256
,
2
,
true
,
false
>>
(
s
,
a
);
else
r
=
layernorm2d_fwd
_
<
trait_
<
data_type
,
1
,
8
,
1
,
256
,
1
,
true
,
false
,
false
>>
(
s
,
a
);
r
=
smoothquant
_
<
trait_
<
data_type
,
1
,
8
,
1
,
256
,
1
,
true
,
false
>>
(
s
,
a
);
}
else
if
(
a
.
n
<=
3072
)
{
if
(
a
.
n
%
8
==
0
)
r
=
layernorm2d_fwd
_
<
trait_
<
data_type
,
1
,
3
,
1
,
128
,
8
,
true
,
false
,
false
>>
(
s
,
a
);
r
=
smoothquant
_
<
trait_
<
data_type
,
1
,
3
,
1
,
128
,
8
,
true
,
false
>>
(
s
,
a
);
else
if
(
a
.
n
%
4
==
0
)
r
=
layernorm2d_fwd
_
<
trait_
<
data_type
,
1
,
3
,
1
,
256
,
4
,
true
,
false
,
false
>>
(
s
,
a
);
r
=
smoothquant
_
<
trait_
<
data_type
,
1
,
3
,
1
,
256
,
4
,
true
,
false
>>
(
s
,
a
);
else
if
(
a
.
n
%
2
==
0
)
r
=
layernorm2d_fwd
_
<
trait_
<
data_type
,
1
,
6
,
1
,
256
,
2
,
true
,
false
,
false
>>
(
s
,
a
);
r
=
smoothquant
_
<
trait_
<
data_type
,
1
,
6
,
1
,
256
,
2
,
true
,
false
>>
(
s
,
a
);
else
r
=
layernorm2d_fwd
_
<
trait_
<
data_type
,
1
,
3
,
1
,
1024
,
1
,
true
,
false
,
false
>>
(
s
,
a
);
r
=
smoothquant
_
<
trait_
<
data_type
,
1
,
3
,
1
,
1024
,
1
,
true
,
false
>>
(
s
,
a
);
}
else
if
(
a
.
n
<=
4096
)
{
if
(
a
.
n
%
8
==
0
)
r
=
layernorm2d_fwd
_
<
trait_
<
data_type
,
1
,
2
,
1
,
256
,
8
,
true
,
false
,
false
>>
(
s
,
a
);
r
=
smoothquant
_
<
trait_
<
data_type
,
1
,
2
,
1
,
256
,
8
,
true
,
false
>>
(
s
,
a
);
else
if
(
a
.
n
%
4
==
0
)
r
=
layernorm2d_fwd
_
<
trait_
<
data_type
,
1
,
4
,
1
,
256
,
4
,
true
,
false
,
false
>>
(
s
,
a
);
r
=
smoothquant
_
<
trait_
<
data_type
,
1
,
4
,
1
,
256
,
4
,
true
,
false
>>
(
s
,
a
);
else
if
(
a
.
n
%
2
==
0
)
r
=
layernorm2d_fwd
_
<
trait_
<
data_type
,
1
,
2
,
1
,
1024
,
2
,
true
,
false
,
false
>>
(
s
,
a
);
r
=
smoothquant
_
<
trait_
<
data_type
,
1
,
2
,
1
,
1024
,
2
,
true
,
false
>>
(
s
,
a
);
else
r
=
layernorm2d_fwd
_
<
trait_
<
data_type
,
1
,
4
,
1
,
1024
,
1
,
true
,
false
,
false
>>
(
s
,
a
);
r
=
smoothquant
_
<
trait_
<
data_type
,
1
,
4
,
1
,
1024
,
1
,
true
,
false
>>
(
s
,
a
);
}
else
if
(
a
.
n
>
4096
)
{
if
(
a
.
n
%
8
==
0
)
r
=
layernorm2d_fwd
_
<
trait_
<
data_type
,
1
,
2
,
1
,
256
,
8
,
true
,
false
,
true
>>
(
s
,
a
);
r
=
smoothquant
_
<
trait_
<
data_type
,
1
,
2
,
1
,
256
,
8
,
true
,
true
>>
(
s
,
a
);
else
if
(
a
.
n
%
4
==
0
)
r
=
layernorm2d_fwd
_
<
trait_
<
data_type
,
1
,
4
,
1
,
256
,
4
,
true
,
false
,
true
>>
(
s
,
a
);
r
=
smoothquant
_
<
trait_
<
data_type
,
1
,
4
,
1
,
256
,
4
,
true
,
true
>>
(
s
,
a
);
else
if
(
a
.
n
%
2
==
0
)
r
=
layernorm2d_fwd
_
<
trait_
<
data_type
,
1
,
2
,
1
,
1024
,
2
,
true
,
false
,
true
>>
(
s
,
a
);
r
=
smoothquant
_
<
trait_
<
data_type
,
1
,
2
,
1
,
1024
,
2
,
true
,
true
>>
(
s
,
a
);
else
r
=
layernorm2d_fwd
_
<
trait_
<
data_type
,
1
,
4
,
1
,
1024
,
1
,
true
,
false
,
true
>>
(
s
,
a
);
r
=
smoothquant
_
<
trait_
<
data_type
,
1
,
4
,
1
,
1024
,
1
,
true
,
true
>>
(
s
,
a
);
}
return
r
;
#else
return
layernorm2d_fwd_
<
trait_
<
data_type
,
1
,
1
,
1
,
256
,
4
,
true
,
false
,
false
>>
(
s
,
a
);
#endif
// clang-format on
}
float
layernorm2d_fwd
(
layernorm2d_fwd_traits
t
,
layernorm2d_fwd_args
a
,
const
ck_tile
::
stream_config
&
s
)
float
smoothquant
(
smoothquant_traits
t
,
smoothquant_args
a
,
const
ck_tile
::
stream_config
&
s
)
{
float
r
=
-
1
;
if
(
t
.
data_type
.
compare
(
"fp16"
)
==
0
)
{
return
layernorm2d_fwd_b16_
<
ck_tile
::
fp16_t
>
(
t
,
a
,
s
);
return
smoothquant_dispatch
<
ck_tile
::
fp16_t
>
(
t
,
a
,
s
);
}
else
if
(
t
.
data_type
.
compare
(
"bf16"
)
==
0
)
{
return
layernorm2d_fwd_b16_
<
ck_tile
::
bf16_t
>
(
t
,
a
,
s
);
return
smoothquant_dispatch
<
ck_tile
::
bf16_t
>
(
t
,
a
,
s
);
}
if
(
r
<
0
)
else
throw
std
::
runtime_error
(
"Without supported instances!"
);
return
r
;
}
example/ck_tile/
0
2_
layernorm2d/instances/layernorm2d_fwd
_instance_common.hpp
→
example/ck_tile/
1
2_
smoothquant/instances/smoothquant
_instance_common.hpp
View file @
ead5167a
...
...
@@ -3,13 +3,13 @@
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include <ck_tile/core.hpp>
#include "
layernorm2d_fwd
.hpp"
#include "
smoothquant
.hpp"
#include <iostream>
#pragma once
using
S
=
ck_tile
::
stream_config
;
using
A
=
layernorm2d_fwd
_args
;
using
A
=
smoothquant
_args
;
template
<
typename
DataType_
,
ck_tile
::
index_t
Repeat_M_
,
// each thread repeat along M
...
...
@@ -18,41 +18,36 @@ template <typename DataType_,
ck_tile
::
index_t
ThreadPerBlock_N_
,
// num threads along N
ck_tile
::
index_t
Vector_N_
,
// vector size along N
bool
kPadN_
,
bool
kSaveMeanInvStd_
,
bool
kTwoPass_
>
using
trait_
=
layernorm2d_fwd_traits_
<
DataType_
,
Repeat_M_
,
Repeat_N_
,
ThreadPerBlock_M_
,
ThreadPerBlock_N_
,
Vector_N_
,
kPadN_
,
kSaveMeanInvStd_
,
kTwoPass_
>
;
using
trait_
=
smoothquant_traits_
<
DataType_
,
Repeat_M_
,
Repeat_N_
,
ThreadPerBlock_M_
,
ThreadPerBlock_N_
,
Vector_N_
,
kPadN_
,
kTwoPass_
>
;
template
<
typename
Traits_
>
float
layernorm2d_fwd
_
(
const
S
&
s
,
A
a
)
float
smoothquant
_
(
const
S
&
s
,
A
a
)
{
using
DataType
=
typename
Traits_
::
DataType
;
using
PipelineProblem
=
ck_tile
::
Layernorm2dFwdPipelineProblem
<
typename
LayerNormTypeConfig
<
DataType
>::
XDataType
,
typename
LayerNormTypeConfig
<
DataType
>::
GammaDataType
,
typename
LayerNormTypeConfig
<
DataType
>::
BetaDataType
,
typename
LayerNormTypeConfig
<
DataType
>::
ComputeDataType
,
typename
LayerNormTypeConfig
<
DataType
>::
YDataType
,
typename
LayerNormTypeConfig
<
DataType
>::
MeanDataType
,
typename
LayerNormTypeConfig
<
DataType
>::
InvStdDataType
,
using
PipelineProblem
=
ck_tile
::
SmoothquantPipelineProblem
<
typename
SmoothquantTypeConfig
<
DataType
>::
XDataType
,
typename
SmoothquantTypeConfig
<
DataType
>::
XScaleDataType
,
typename
SmoothquantTypeConfig
<
DataType
>::
ComputeDataType
,
typename
SmoothquantTypeConfig
<
DataType
>::
YScaleDataType
,
typename
SmoothquantTypeConfig
<
DataType
>::
QYDataType
,
typename
Traits_
::
Shape
,
Traits_
::
kPadN
,
Traits_
::
kSaveMeanInvStd
,
Traits_
::
kTwoPass
>
;
using
OnePassPipeline
=
ck_tile
::
Layernorm2dFwd
PipelineOnePass
<
PipelineProblem
>
;
using
TwoPassPipeline
=
ck_tile
::
Layernorm2dFwd
PipelineTwoPass
<
PipelineProblem
>
;
using
OnePassPipeline
=
ck_tile
::
Smoothquant
PipelineOnePass
<
PipelineProblem
>
;
using
TwoPassPipeline
=
ck_tile
::
Smoothquant
PipelineTwoPass
<
PipelineProblem
>
;
using
Pipeline
=
std
::
conditional_t
<
Traits_
::
kTwoPass
,
TwoPassPipeline
,
OnePassPipeline
>
;
using
Kernel
=
ck_tile
::
Layernorm2dFwd
<
Pipeline
>
;
using
Kernel
=
ck_tile
::
Smoothquant
<
Pipeline
>
;
const
dim3
grids
=
Kernel
::
GridSize
(
a
);
constexpr
dim3
blocks
=
Kernel
::
BlockSize
();
...
...
example/ck_tile/12_smoothquant/script/perf_test.sh
0 → 100755
View file @
ead5167a
EXE
=
"
$(
find
.
-name
tile_smoothquant
-type
f |
head
-n
1
)
"
$EXE
-m
=
1
-n
=
1
-e
=
1e-12
-v
=
1
-prec
=
bf16
-repeat
=
1000
$EXE
-m
=
700
-n
=
80
-e
=
1e-12
-v
=
1
-prec
=
bf16
-repeat
=
1000
$EXE
-m
=
700
-n
=
128
-e
=
1e-12
-v
=
1
-prec
=
bf16
-repeat
=
1000
$EXE
-m
=
700
-n
=
144
-e
=
1e-12
-v
=
1
-prec
=
bf16
-repeat
=
1000
$EXE
-m
=
700
-n
=
168
-e
=
1e-12
-v
=
1
-prec
=
bf16
-repeat
=
1000
$EXE
-m
=
700
-n
=
184
-e
=
1e-12
-v
=
1
-prec
=
bf16
-repeat
=
1000
$EXE
-m
=
700
-n
=
256
-e
=
1e-12
-v
=
1
-prec
=
bf16
-repeat
=
1000
$EXE
-m
=
700
-n
=
288
-e
=
1e-12
-v
=
1
-prec
=
bf16
-repeat
=
1000
$EXE
-m
=
700
-n
=
344
-e
=
1e-12
-v
=
1
-prec
=
bf16
-repeat
=
1000
$EXE
-m
=
700
-n
=
376
-e
=
1e-12
-v
=
1
-prec
=
bf16
-repeat
=
1000
$EXE
-m
=
700
-n
=
448
-e
=
1e-12
-v
=
1
-prec
=
bf16
-repeat
=
1000
$EXE
-m
=
700
-n
=
512
-e
=
1e-12
-v
=
1
-prec
=
bf16
-repeat
=
1000
$EXE
-m
=
700
-n
=
924
-e
=
1e-12
-v
=
1
-prec
=
bf16
-repeat
=
1000
$EXE
-m
=
700
-n
=
1024
-e
=
1e-12
-v
=
1
-prec
=
bf16
-repeat
=
1000
$EXE
-m
=
700
-n
=
1078
-e
=
1e-12
-v
=
1
-prec
=
bf16
-repeat
=
1000
$EXE
-m
=
700
-n
=
1996
-e
=
1e-12
-v
=
1
-prec
=
bf16
-repeat
=
1000
$EXE
-m
=
700
-n
=
4080
-e
=
1e-12
-v
=
1
-prec
=
bf16
-repeat
=
1000
$EXE
-m
=
700
-n
=
80
-e
=
1e-12
-v
=
1
-prec
=
fp16
-repeat
=
1000
$EXE
-m
=
700
-n
=
128
-e
=
1e-12
-v
=
1
-prec
=
fp16
-repeat
=
1000
$EXE
-m
=
700
-n
=
144
-e
=
1e-12
-v
=
1
-prec
=
fp16
-repeat
=
1000
$EXE
-m
=
700
-n
=
168
-e
=
1e-12
-v
=
1
-prec
=
fp16
-repeat
=
1000
$EXE
-m
=
700
-n
=
184
-e
=
1e-12
-v
=
1
-prec
=
fp16
-repeat
=
1000
$EXE
-m
=
700
-n
=
256
-e
=
1e-12
-v
=
1
-prec
=
fp16
-repeat
=
1000
$EXE
-m
=
700
-n
=
288
-e
=
1e-12
-v
=
1
-prec
=
fp16
-repeat
=
1000
$EXE
-m
=
700
-n
=
344
-e
=
1e-12
-v
=
1
-prec
=
fp16
-repeat
=
1000
$EXE
-m
=
700
-n
=
376
-e
=
1e-12
-v
=
1
-prec
=
fp16
-repeat
=
1000
$EXE
-m
=
700
-n
=
448
-e
=
1e-12
-v
=
1
-prec
=
fp16
-repeat
=
1000
$EXE
-m
=
700
-n
=
512
-e
=
1e-12
-v
=
1
-prec
=
fp16
-repeat
=
1000
$EXE
-m
=
700
-n
=
924
-e
=
1e-12
-v
=
1
-prec
=
fp16
-repeat
=
1000
$EXE
-m
=
700
-n
=
1024
-e
=
1e-12
-v
=
1
-prec
=
fp16
-repeat
=
1000
$EXE
-m
=
700
-n
=
1078
-e
=
1e-12
-v
=
1
-prec
=
fp16
-repeat
=
1000
$EXE
-m
=
700
-n
=
1996
-e
=
1e-12
-v
=
1
-prec
=
fp16
-repeat
=
1000
$EXE
-m
=
700
-n
=
4080
-e
=
1e-12
-v
=
1
-prec
=
fp16
-repeat
=
1000
\ No newline at end of file
example/ck_tile/12_smoothquant/script/smoke_test.sh
0 → 100755
View file @
ead5167a
#!/bin/sh
EXE
=
"
$(
find
.
-name
tile_smoothquant
-type
f |
head
-n
1
)
"
for
pr_i
in
"fp16"
"bf16"
;
do
$EXE
-prec
=
$pr_i
-m
=
99
-n
=
13
$EXE
-prec
=
$pr_i
-m
=
17
-n
=
16
$EXE
-prec
=
$pr_i
-m
=
1
-n
=
100
$EXE
-prec
=
$pr_i
-m
=
4
-n
=
128
$EXE
-prec
=
$pr_i
-m
=
80
-n
=
127
$EXE
-prec
=
$pr_i
-m
=
22
-n
=
255
-stride
=
256
$EXE
-prec
=
$pr_i
-m
=
7
-n
=
599
$EXE
-prec
=
$pr_i
-m
=
19
-n
=
512
$EXE
-prec
=
$pr_i
-m
=
33
-n
=
313
-stride
=
1000
$EXE
-prec
=
$pr_i
-m
=
11
-n
=
510
$EXE
-prec
=
$pr_i
-m
=
171
-n
=
676
-stride
=
818
$EXE
-prec
=
$pr_i
-m
=
91
-n
=
636
$EXE
-prec
=
$pr_i
-m
=
12
-n
=
768
-stride
=
800
$EXE
-prec
=
$pr_i
-m
=
100
-n
=
766
-stride
=
812
$EXE
-prec
=
$pr_i
-m
=
31
-n
=
1024
$EXE
-prec
=
$pr_i
-m
=
64
-n
=
1000
-stride
=
1004
$EXE
-prec
=
$pr_i
-m
=
8
-n
=
1501
$EXE
-prec
=
$pr_i
-m
=
3
-n
=
1826
$EXE
-prec
=
$pr_i
-m
=
5
-n
=
2040
$EXE
-prec
=
$pr_i
-m
=
7
-n
=
2734
$EXE
-prec
=
$pr_i
-m
=
1
-n
=
3182
$EXE
-prec
=
$pr_i
-m
=
9
-n
=
4096
$EXE
-prec
=
$pr_i
-m
=
3
-n
=
8192
$EXE
-prec
=
$pr_i
-m
=
1
-n
=
10547
$EXE
-prec
=
$pr_i
-m
=
3
-n
=
17134
done
example/ck_tile/12_smoothquant/smoothquant.cpp
0 → 100644
View file @
ead5167a
#include "ck_tile/host.hpp"
#include "smoothquant.hpp"
#include <cstring>
// different threshold for different dtype
template
<
typename
DataType
>
auto
get_elimit
()
{
double
rtol
=
1e-5
;
double
atol
=
1e-5
;
return
ck_tile
::
make_tuple
(
rtol
,
atol
);
}
template
<
>
auto
get_elimit
<
ck_tile
::
bf16_t
>
()
{
double
rtol
=
1e-5
;
double
atol
=
1e-5
;
return
ck_tile
::
make_tuple
(
rtol
,
atol
);
}
template
<
>
auto
get_elimit
<
ck_tile
::
int8_t
>
()
{
// due to rounding, int8 quantization might have 1 abs error
double
rtol
=
1
;
double
atol
=
1
;
return
ck_tile
::
make_tuple
(
rtol
,
atol
);
}
auto
create_args
(
int
argc
,
char
*
argv
[])
{
ck_tile
::
ArgParser
arg_parser
;
arg_parser
.
insert
(
"m"
,
"3328"
,
"m dimension"
)
.
insert
(
"n"
,
"4096"
,
"n dimension"
)
.
insert
(
"stride"
,
"-1"
,
"stride per row, if -1 then equal to n"
)
.
insert
(
"v"
,
"1"
,
"cpu validation or not"
)
.
insert
(
"kname"
,
"1"
,
"print kernel name or not"
)
.
insert
(
"prec"
,
"fp16"
,
"precision"
)
.
insert
(
"warmup"
,
"5"
,
"cold iter"
)
.
insert
(
"repeat"
,
"20"
,
"hot iter"
);
bool
result
=
arg_parser
.
parse
(
argc
,
argv
);
return
std
::
make_tuple
(
result
,
arg_parser
);
}
template
<
typename
DataType
>
bool
run
(
const
ck_tile
::
ArgParser
&
arg_parser
)
{
ck_tile
::
index_t
m
=
arg_parser
.
get_int
(
"m"
);
ck_tile
::
index_t
n
=
arg_parser
.
get_int
(
"n"
);
ck_tile
::
index_t
stride
=
arg_parser
.
get_int
(
"stride"
);
if
(
stride
<
0
)
stride
=
n
;
std
::
string
data_type
=
arg_parser
.
get_str
(
"prec"
);
int
kname
=
arg_parser
.
get_int
(
"kname"
);
int
do_validation
=
arg_parser
.
get_int
(
"v"
);
int
warmup
=
arg_parser
.
get_int
(
"warmup"
);
int
repeat
=
arg_parser
.
get_int
(
"repeat"
);
assert
(
stride
>=
n
);
using
TypeConfig
=
SmoothquantTypeConfig
<
DataType
>
;
using
XDataType
=
typename
TypeConfig
::
XDataType
;
using
XScaleDataType
=
typename
TypeConfig
::
XScaleDataType
;
using
YScaleDataType
=
typename
TypeConfig
::
YScaleDataType
;
using
QYDataType
=
typename
TypeConfig
::
QYDataType
;
using
ComputeDataType
=
typename
TypeConfig
::
ComputeDataType
;
// host verify
ck_tile
::
HostTensor
<
XDataType
>
x_host
({
m
,
n
},
{
stride
,
1
});
ck_tile
::
HostTensor
<
XScaleDataType
>
xscale_host
({
n
});
ck_tile
::
HostTensor
<
YScaleDataType
>
yscale_host_ref
({
m
},
{
1
});
ck_tile
::
HostTensor
<
YScaleDataType
>
yscale_host_dev
({
m
},
{
1
});
ck_tile
::
HostTensor
<
QYDataType
>
qy_host_ref
({
m
,
n
},
{
stride
,
1
});
ck_tile
::
HostTensor
<
QYDataType
>
qy_host_dev
({
m
,
n
},
{
stride
,
1
});
ck_tile
::
FillUniformDistribution
<
XDataType
>
{
-
.5
f
,
.5
f
}(
x_host
);
ck_tile
::
FillUniformDistribution
<
XScaleDataType
>
{
1e-3
,
.5
f
}(
xscale_host
);
ck_tile
::
DeviceMem
x_buf
(
x_host
.
get_element_space_size_in_bytes
());
ck_tile
::
DeviceMem
xscale_buf
(
xscale_host
.
get_element_space_size_in_bytes
());
ck_tile
::
DeviceMem
yscale_buf
(
yscale_host_dev
.
get_element_space_size_in_bytes
());
ck_tile
::
DeviceMem
qy_buf
(
qy_host_dev
.
get_element_space_size_in_bytes
());
x_buf
.
ToDevice
(
x_host
.
data
());
xscale_buf
.
ToDevice
(
xscale_host
.
data
());
std
::
cout
<<
"["
<<
data_type
<<
"]"
<<
" m:"
<<
m
<<
", n:"
<<
n
<<
", stride:"
<<
stride
<<
std
::
flush
;
smoothquant_traits
traits
{
data_type
};
smoothquant_args
args
{
x_buf
.
GetDeviceBuffer
(),
xscale_buf
.
GetDeviceBuffer
(),
yscale_buf
.
GetDeviceBuffer
(),
qy_buf
.
GetDeviceBuffer
(),
m
,
n
,
stride
};
float
ave_time
=
smoothquant
(
traits
,
args
,
ck_tile
::
stream_config
{
nullptr
,
true
,
kname
?
1
:
0
,
warmup
,
repeat
});
std
::
size_t
num_byte
=
sizeof
(
XDataType
)
*
m
*
n
+
sizeof
(
XScaleDataType
)
*
n
+
sizeof
(
YScaleDataType
)
*
m
+
sizeof
(
QYDataType
)
*
m
*
n
;
float
gb_per_sec
=
num_byte
/
1.E6
/
ave_time
;
std
::
cout
<<
", "
<<
ave_time
*
1.E3
<<
" us, "
<<
gb_per_sec
<<
" GB/s"
<<
std
::
flush
;
bool
pass
=
true
;
if
(
do_validation
)
{
using
YDataType
=
ComputeDataType
;
ck_tile
::
HostTensor
<
ComputeDataType
>
y_host
({
m
,
n
},
{
stride
,
1
});
// smooth outlier
{
auto
f
=
[
&
](
auto
n_
)
{
auto
v_xscale
=
ck_tile
::
type_convert
<
ComputeDataType
>
(
xscale_host
(
n_
));
for
(
int
m_
=
0
;
m_
<
m
;
++
m_
)
{
auto
v_x
=
ck_tile
::
type_convert
<
ComputeDataType
>
(
x_host
(
m_
,
n_
));
y_host
(
m_
,
n_
)
=
v_x
*
v_xscale
;
}
};
ck_tile
::
make_ParallelTensorFunctor
(
f
,
xscale_host
.
get_element_space_size
())(
std
::
thread
::
hardware_concurrency
());
}
// yscale
{
ck_tile
::
HostTensor
<
YDataType
>
y_rowwise_amax_host
({
m
});
using
ReduceAmax
=
ck_tile
::
ReduceOp
::
AbsMax
;
ck_tile
::
reference_reduce
<
ComputeDataType
,
ComputeDataType
,
YDataType
>
(
y_host
,
y_rowwise_amax_host
,
ReduceAmax
{});
auto
op
=
[](
const
auto
&
v0
)
{
return
v0
/
ck_tile
::
type_convert
<
ComputeDataType
>
(
ck_tile
::
numeric
<
QYDataType
>::
max
());
};
ck_tile
::
reference_unary_elementwise
<
YDataType
,
YScaleDataType
,
ComputeDataType
>
(
y_rowwise_amax_host
,
yscale_host_ref
,
op
);
yscale_buf
.
FromDevice
(
yscale_host_dev
.
mData
.
data
());
auto
[
rtol
,
atol
]
=
get_elimit
<
YScaleDataType
>
();
pass
&=
ck_tile
::
check_err
(
yscale_host_dev
,
yscale_host_ref
,
std
::
string
(
"yscale Error: Incorrect results!"
),
rtol
,
atol
);
}
// rowwise quantization
{
ck_tile
::
reference_rowwise_quantization2d
<
YDataType
,
YScaleDataType
,
QYDataType
>
(
y_host
,
yscale_host_ref
,
qy_host_ref
);
qy_buf
.
FromDevice
(
qy_host_dev
.
data
());
auto
[
rtol
,
atol
]
=
get_elimit
<
QYDataType
>
();
if
(
stride
==
n
)
{
pass
=
ck_tile
::
check_err
(
qy_host_dev
,
qy_host_ref
,
std
::
string
(
"qy Error: Incorrect results!"
),
rtol
,
atol
);
}
else
{
for
(
int
i_r
=
0
;
i_r
<
m
;
i_r
++
)
{
std
::
vector
<
QYDataType
>
qy_host_dev_row
(
qy_host_dev
.
begin
()
+
i_r
*
stride
,
qy_host_dev
.
begin
()
+
i_r
*
stride
+
n
);
std
::
vector
<
QYDataType
>
qy_host_ref_row
(
qy_host_ref
.
begin
()
+
i_r
*
stride
,
qy_host_ref
.
begin
()
+
i_r
*
stride
+
n
);
pass
&=
ck_tile
::
check_err
(
qy_host_dev_row
,
qy_host_ref_row
,
std
::
string
(
"qy["
)
+
std
::
to_string
(
i_r
)
+
std
::
string
(
"] Error: Incorrect results!"
),
rtol
,
atol
);
}
}
}
std
::
cout
<<
", valid:"
<<
(
pass
?
"y"
:
"n"
)
<<
std
::
flush
<<
std
::
endl
;
}
return
pass
;
}
int
main
(
int
argc
,
char
*
argv
[])
{
auto
[
result
,
arg_parser
]
=
create_args
(
argc
,
argv
);
if
(
!
result
)
return
-
1
;
const
std
::
string
data_type
=
arg_parser
.
get_str
(
"prec"
);
if
(
data_type
==
"fp16"
)
{
return
run
<
ck_tile
::
half_t
>
(
arg_parser
)
?
0
:
-
2
;
}
else
if
(
data_type
==
"bf16"
)
{
return
run
<
ck_tile
::
bf16_t
>
(
arg_parser
)
?
0
:
-
2
;
}
return
-
3
;
}
example/ck_tile/12_smoothquant/smoothquant.hpp
0 → 100644
View file @
ead5167a
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/kernel_launch.hpp"
#include "ck_tile/ops/smoothquant.hpp"
#include <string>
template
<
typename
DataType
>
struct
SmoothquantTypeConfig
;
template
<
>
struct
SmoothquantTypeConfig
<
ck_tile
::
half_t
>
{
using
XDataType
=
ck_tile
::
half_t
;
using
XScaleDataType
=
float
;
using
YScaleDataType
=
float
;
using
QYDataType
=
ck_tile
::
int8_t
;
using
ComputeDataType
=
float
;
};
template
<
>
struct
SmoothquantTypeConfig
<
ck_tile
::
bf16_t
>
{
using
XDataType
=
ck_tile
::
bf16_t
;
using
XScaleDataType
=
float
;
using
YScaleDataType
=
float
;
using
QYDataType
=
ck_tile
::
int8_t
;
using
ComputeDataType
=
float
;
};
// runtime args
struct
smoothquant_args
:
public
ck_tile
::
SmoothquantHostArgs
{
};
// this is used to pattern-match internl kernel implementation, not to instantiate kernel
template
<
typename
DataType_
,
ck_tile
::
index_t
Repeat_M_
,
// each thread repeat along M
ck_tile
::
index_t
Repeat_N_
,
// each thread repeat along N
ck_tile
::
index_t
ThreadPerBlock_M_
,
// num threads along M
ck_tile
::
index_t
ThreadPerBlock_N_
,
// num threads along N
ck_tile
::
index_t
Vector_N_
,
// vector size along N
bool
kPadN_
,
bool
kTwoPass_
>
struct
smoothquant_traits_
{
using
DataType
=
ck_tile
::
remove_cvref_t
<
DataType_
>
;
static
constexpr
bool
is_warp_per_row
=
ThreadPerBlock_N_
<=
warpSize
;
static_assert
((
ThreadPerBlock_M_
*
ThreadPerBlock_N_
)
%
warpSize
==
0
);
static
constexpr
ck_tile
::
index_t
total_warps
=
(
ThreadPerBlock_M_
*
ThreadPerBlock_N_
)
/
warpSize
;
// num of warps along m
static
constexpr
ck_tile
::
index_t
BlockWarps_M
=
[]()
{
if
constexpr
(
is_warp_per_row
)
{
static_assert
(
warpSize
%
ThreadPerBlock_N_
==
0
);
return
total_warps
*
(
warpSize
/
ThreadPerBlock_N_
);
}
else
{
// static_assert(warpSize % ThreadPerBlock_M_ == 0);
return
total_warps
/
(
ThreadPerBlock_N_
/
warpSize
);
}
}();
// num of warps along n
static
constexpr
ck_tile
::
index_t
BlockWarps_N
=
[]()
{
if
constexpr
(
is_warp_per_row
)
{
static_assert
(
warpSize
%
ThreadPerBlock_N_
==
0
);
return
1
;
}
else
{
static_assert
(
ThreadPerBlock_N_
%
warpSize
==
0
);
return
ThreadPerBlock_N_
/
warpSize
;
}
}();
static
constexpr
ck_tile
::
index_t
Repeat_M
=
Repeat_M_
;
static
constexpr
ck_tile
::
index_t
Repeat_N
=
Repeat_N_
;
static
constexpr
ck_tile
::
index_t
Block_M
=
Repeat_M_
*
ThreadPerBlock_M_
;
static
constexpr
ck_tile
::
index_t
Block_N
=
Repeat_N_
*
ThreadPerBlock_N_
*
Vector_N_
;
static
constexpr
ck_tile
::
index_t
Warp_M
=
ThreadPerBlock_M_
/
BlockWarps_M
;
static
constexpr
ck_tile
::
index_t
Warp_N
=
ThreadPerBlock_N_
/
BlockWarps_N
*
Vector_N_
;
using
BlockTile
=
ck_tile
::
sequence
<
Block_M
,
Block_N
>
;
using
BlockWarps
=
ck_tile
::
sequence
<
BlockWarps_M
,
BlockWarps_N
>
;
using
WarpTile
=
ck_tile
::
sequence
<
Warp_M
,
Warp_N
>
;
using
Vector
=
ck_tile
::
sequence
<
1
,
Vector_N_
>
;
using
Shape
=
ck_tile
::
Generic2dBlockShape
<
BlockTile
,
BlockWarps
,
WarpTile
,
Vector
>
;
static
constexpr
bool
kPadN
=
kPadN_
;
static
constexpr
bool
kTwoPass
=
kTwoPass_
;
};
// Type-erased launcher for one compiled smoothquant instance. Traits_ is a
// smoothquant_traits_ specialization that fixes the tile/thread configuration
// at compile time; the definition is provided by the generated instance .cpp
// files. Returns the elapsed time reported via the stream_config `s`.
template <typename Traits_>
float smoothquant_(const ck_tile::stream_config& s, smoothquant_args a);
// This is the public API, will be generated by script
// Runtime dispatch key for smoothquant(): selects which pre-built kernel
// instance to launch.
struct smoothquant_traits
{
    // Element type name of the input tensor (presumably e.g. "fp16", matching
    // the generated instance files — TODO confirm the accepted strings).
    std::string data_type;
};
// Public entry point: dispatches on the traits' data_type to a generated
// instance and launches it with the given arguments on the given stream.
// Returns the measured time from the stream_config (units/error convention
// follow ck_tile::stream_config — NOTE(review): confirm in the .cpp).
float smoothquant(smoothquant_traits, smoothquant_args, const ck_tile::stream_config&);
example/ck_tile/1
2
_moe_sorting/CMakeLists.txt
→
example/ck_tile/1
3
_moe_sorting/CMakeLists.txt
View file @
ead5167a
File moved
example/ck_tile/1
2
_moe_sorting/README.md
→
example/ck_tile/1
3
_moe_sorting/README.md
View file @
ead5167a
File moved
example/ck_tile/1
2
_moe_sorting/moe_sorting.cpp
→
example/ck_tile/1
3
_moe_sorting/moe_sorting.cpp
View file @
ead5167a
File moved
example/ck_tile/1
2
_moe_sorting/moe_sorting_api.cpp
→
example/ck_tile/1
3
_moe_sorting/moe_sorting_api.cpp
View file @
ead5167a
File moved
example/ck_tile/1
2
_moe_sorting/moe_sorting_api.hpp
→
example/ck_tile/1
3
_moe_sorting/moe_sorting_api.hpp
View file @
ead5167a
File moved
example/ck_tile/1
2
_moe_sorting/script/smoke_test.sh
→
example/ck_tile/1
3
_moe_sorting/script/smoke_test.sh
View file @
ead5167a
File moved
example/ck_tile/CMakeLists.txt
View file @
ead5167a
...
...
@@ -11,4 +11,5 @@ add_subdirectory(06_permute)
add_subdirectory
(
09_topk_softmax
)
add_subdirectory
(
10_rmsnorm2d
)
add_subdirectory
(
11_add_rmsnorm2d_rdquant
)
add_subdirectory
(
12_moe_sorting
)
add_subdirectory
(
12_smoothquant
)
add_subdirectory
(
13_moe_sorting
)
include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
View file @
ead5167a
...
...
@@ -13,15 +13,17 @@ namespace ck {
namespace
tensor_operation
{
namespace
element_wise
{
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wnon-virtual-dtor"
struct
UnaryOpBase
{
public:
__host__
__device__
virtual
~
UnaryOpBase
()
=
default
;
__host__
__device__
~
UnaryOpBase
()
=
default
;
__host__
__device__
UnaryOpBase
()
=
default
;
__host__
__device__
UnaryOpBase
(
const
UnaryOpBase
&
)
=
default
;
__host__
__device__
constexpr
UnaryOpBase
()
=
default
;
__host__
__device__
constexpr
UnaryOpBase
(
const
UnaryOpBase
&
)
=
default
;
__host__
__device__
constexpr
UnaryOpBase
(
UnaryOpBase
&&
)
=
default
;
__host__
__device__
UnaryOpBase
&
operator
=
(
const
UnaryOpBase
&
)
=
default
;
__host__
__device__
UnaryOpBase
(
UnaryOpBase
&&
)
=
default
;
__host__
__device__
UnaryOpBase
&
operator
=
(
UnaryOpBase
&&
)
=
default
;
__host__
__device__
virtual
inline
void
operator
()(
float
&
y
,
const
float
&
x
)
const
=
0
;
...
...
@@ -50,8 +52,14 @@ struct PassThroughPack2
constexpr
const
static
bool
is_pack2_invocable
=
true
;
};
struct
PassThrough
:
public
UnaryOpBase
struct
PassThrough
final
:
public
UnaryOpBase
{
__host__
__device__
constexpr
PassThrough
()
=
default
;
__host__
__device__
constexpr
PassThrough
(
const
PassThrough
&
)
=
default
;
__host__
__device__
constexpr
PassThrough
(
PassThrough
&&
)
=
default
;
__host__
__device__
PassThrough
&
operator
=
(
const
PassThrough
&
)
=
default
;
__host__
__device__
PassThrough
&
operator
=
(
PassThrough
&&
)
=
default
;
__host__
__device__
~
PassThrough
()
=
default
;
__host__
__device__
inline
void
operator
()(
float
&
y
,
const
float
&
x
)
const
final
{
y
=
x
;
}
...
...
@@ -409,8 +417,15 @@ struct UnarySquare
};
};
struct
UnaryAbs
:
public
UnaryOpBase
struct
UnaryAbs
final
:
public
UnaryOpBase
{
__host__
__device__
constexpr
UnaryAbs
()
=
default
;
__host__
__device__
constexpr
UnaryAbs
(
const
UnaryAbs
&
)
=
default
;
__host__
__device__
constexpr
UnaryAbs
(
UnaryAbs
&&
)
=
default
;
__host__
__device__
UnaryAbs
&
operator
=
(
const
UnaryAbs
&
)
=
default
;
__host__
__device__
UnaryAbs
&
operator
=
(
UnaryAbs
&&
)
=
default
;
__host__
__device__
~
UnaryAbs
()
=
default
;
__host__
__device__
inline
void
operator
()(
float
&
y
,
const
float
&
x
)
const
final
{
y
=
ck
::
math
::
abs
(
x
);
...
...
@@ -459,8 +474,15 @@ struct UnarySqrt
};
};
struct
Relu
:
public
UnaryOpBase
struct
Relu
final
:
public
UnaryOpBase
{
__host__
__device__
constexpr
Relu
()
=
default
;
__host__
__device__
constexpr
Relu
(
const
Relu
&
)
=
default
;
__host__
__device__
constexpr
Relu
(
Relu
&&
)
=
default
;
__host__
__device__
Relu
&
operator
=
(
const
Relu
&
)
=
default
;
__host__
__device__
Relu
&
operator
=
(
Relu
&&
)
=
default
;
__host__
__device__
~
Relu
()
=
default
;
__host__
__device__
inline
void
operator
()(
float
&
y
,
const
float
&
x
)
const
final
{
y
=
x
>
0
?
x
:
0
;
...
...
@@ -633,8 +655,14 @@ struct Gelu
}
};
struct
Sigmoid
:
public
UnaryOpBase
struct
Sigmoid
final
:
public
UnaryOpBase
{
__host__
__device__
constexpr
Sigmoid
()
=
default
;
__host__
__device__
constexpr
Sigmoid
(
const
Sigmoid
&
)
=
default
;
__host__
__device__
constexpr
Sigmoid
(
Sigmoid
&&
)
=
default
;
__host__
__device__
Sigmoid
&
operator
=
(
const
Sigmoid
&
)
=
default
;
__host__
__device__
Sigmoid
&
operator
=
(
Sigmoid
&&
)
=
default
;
__host__
__device__
~
Sigmoid
()
=
default
;
__host__
__device__
inline
void
operator
()(
float
&
y
,
const
float
&
x
)
const
final
{
...
...
@@ -688,8 +716,15 @@ struct Silu
};
};
struct
TanH
:
public
UnaryOpBase
struct
TanH
final
:
public
UnaryOpBase
{
__host__
__device__
constexpr
TanH
()
=
default
;
__host__
__device__
constexpr
TanH
(
const
TanH
&
)
=
default
;
__host__
__device__
constexpr
TanH
(
TanH
&&
)
=
default
;
__host__
__device__
TanH
&
operator
=
(
const
TanH
&
)
=
default
;
__host__
__device__
TanH
&
operator
=
(
TanH
&&
)
=
default
;
__host__
__device__
~
TanH
()
=
default
;
__host__
__device__
inline
void
operator
()(
float
&
y
,
const
float
&
x
)
const
final
{
y
=
ck
::
math
::
tanh
(
x
);
...
...
@@ -959,8 +994,12 @@ struct Rcp
};
};
struct
Swish
:
public
UnaryOpBase
struct
Swish
final
:
public
UnaryOpBase
{
__host__
__device__
constexpr
Swish
(
const
Swish
&
)
=
default
;
__host__
__device__
constexpr
Swish
(
Swish
&&
)
=
default
;
__host__
__device__
~
Swish
()
=
default
;
__host__
__device__
Swish
(
float
beta
=
1.0
f
)
:
beta_
(
beta
)
{}
__host__
__device__
float
get_beta
()
const
{
return
beta_
;
}
...
...
@@ -1019,8 +1058,12 @@ struct Swish : public UnaryOpBase
}
};
struct
SoftRelu
:
public
UnaryOpBase
struct
SoftRelu
final
:
public
UnaryOpBase
{
__host__
__device__
constexpr
SoftRelu
(
const
SoftRelu
&
)
=
default
;
__host__
__device__
constexpr
SoftRelu
(
SoftRelu
&&
)
=
default
;
__host__
__device__
~
SoftRelu
()
=
default
;
__host__
__device__
SoftRelu
(
float
alpha
=
1.0
f
)
:
alpha_
(
alpha
)
{}
__host__
__device__
float
get_alpha
()
const
{
return
alpha_
;
}
...
...
@@ -1070,8 +1113,12 @@ struct SoftRelu : public UnaryOpBase
}
};
struct
Power
:
public
UnaryOpBase
struct
Power
final
:
public
UnaryOpBase
{
__host__
__device__
constexpr
Power
(
const
Power
&
)
=
default
;
__host__
__device__
constexpr
Power
(
Power
&&
)
=
default
;
__host__
__device__
~
Power
()
=
default
;
__host__
__device__
Power
(
float
alpha
=
0.
f
,
float
beta
=
1.
f
,
float
gamma
=
2.
f
)
:
alpha_
(
alpha
),
beta_
(
beta
),
gamma_
(
gamma
)
{
...
...
@@ -1148,8 +1195,12 @@ struct Power : public UnaryOpBase
}
};
struct
ClippedRelu
:
public
UnaryOpBase
struct
ClippedRelu
final
:
public
UnaryOpBase
{
__host__
__device__
constexpr
ClippedRelu
(
const
ClippedRelu
&
)
=
default
;
__host__
__device__
constexpr
ClippedRelu
(
ClippedRelu
&&
)
=
default
;
__host__
__device__
~
ClippedRelu
()
=
default
;
__host__
__device__
ClippedRelu
(
float
alpha
=
0.
f
,
float
beta
=
1.
f
)
:
alpha_
(
alpha
),
beta_
(
beta
)
{
...
...
@@ -1205,8 +1256,11 @@ struct ClippedRelu : public UnaryOpBase
}
};
struct
LeakyRelu
:
public
UnaryOpBase
struct
LeakyRelu
final
:
public
UnaryOpBase
{
__host__
__device__
constexpr
LeakyRelu
(
const
LeakyRelu
&
)
=
default
;
__host__
__device__
constexpr
LeakyRelu
(
LeakyRelu
&&
)
=
default
;
__host__
__device__
~
LeakyRelu
()
=
default
;
__host__
__device__
LeakyRelu
(
float
alpha
=
0.
f
)
:
alpha_
(
alpha
)
{}
...
...
@@ -1250,8 +1304,11 @@ struct LeakyRelu : public UnaryOpBase
}
};
struct
Elu
:
public
UnaryOpBase
struct
Elu
final
:
public
UnaryOpBase
{
__host__
__device__
constexpr
Elu
(
const
Elu
&
)
=
default
;
__host__
__device__
constexpr
Elu
(
Elu
&&
)
=
default
;
__host__
__device__
~
Elu
()
=
default
;
__host__
__device__
Elu
(
float
alpha
=
1.
f
)
:
alpha_
(
alpha
)
{}
...
...
@@ -1296,8 +1353,11 @@ struct Elu : public UnaryOpBase
}
};
struct
Logistic
:
public
UnaryOpBase
struct
Logistic
final
:
public
UnaryOpBase
{
__host__
__device__
constexpr
Logistic
(
const
Logistic
&
)
=
default
;
__host__
__device__
constexpr
Logistic
(
Logistic
&&
)
=
default
;
__host__
__device__
~
Logistic
()
=
default
;
__host__
__device__
Logistic
(
float
alpha
=
1.0
f
)
:
alpha_
(
alpha
)
{}
...
...
@@ -1631,8 +1691,23 @@ struct DynamicUnaryOp
__host__
__device__
~
DynamicUnaryOp
()
{
if
(
unary_op_ptr_
)
delete
unary_op_ptr_
;
switch
(
unary_op_type_
)
{
case
(
UnaryOpType
::
Swish
):
delete
static_cast
<
Swish
*>
(
unary_op_ptr_
);
break
;
case
(
UnaryOpType
::
Sigmoid
):
delete
static_cast
<
Sigmoid
*>
(
unary_op_ptr_
);
break
;
case
(
UnaryOpType
::
PassThrough
):
delete
static_cast
<
PassThrough
*>
(
unary_op_ptr_
);
break
;
case
(
UnaryOpType
::
Logistic
):
delete
static_cast
<
Logistic
*>
(
unary_op_ptr_
);
break
;
case
(
UnaryOpType
::
TanH
):
delete
static_cast
<
TanH
*>
(
unary_op_ptr_
);
break
;
case
(
UnaryOpType
::
Relu
):
delete
static_cast
<
Relu
*>
(
unary_op_ptr_
);
break
;
case
(
UnaryOpType
::
SoftRelu
):
delete
static_cast
<
SoftRelu
*>
(
unary_op_ptr_
);
break
;
case
(
UnaryOpType
::
UnaryAbs
):
delete
static_cast
<
UnaryAbs
*>
(
unary_op_ptr_
);
break
;
case
(
UnaryOpType
::
Power
):
delete
static_cast
<
Power
*>
(
unary_op_ptr_
);
break
;
case
(
UnaryOpType
::
ClippedRelu
):
delete
static_cast
<
ClippedRelu
*>
(
unary_op_ptr_
);
break
;
case
(
UnaryOpType
::
LeakyRelu
):
delete
static_cast
<
LeakyRelu
*>
(
unary_op_ptr_
);
break
;
case
(
UnaryOpType
::
Elu
):
delete
static_cast
<
Elu
*>
(
unary_op_ptr_
);
break
;
default:
break
;
}
}
__device__
void
InitUnaryOpPtrOnDevice
()
...
...
@@ -1721,6 +1796,7 @@ struct DynamicUnaryOp
float
beta
;
float
gamma
;
};
#pragma clang diagnostic pop
}
// namespace element_wise
}
// namespace tensor_operation
...
...
include/ck_tile/core.hpp
View file @
ead5167a
...
...
@@ -25,6 +25,7 @@
#include "ck_tile/core/numeric/bfloat16.hpp"
#include "ck_tile/core/numeric/float8.hpp"
#include "ck_tile/core/numeric/half.hpp"
#include "ck_tile/core/numeric/int8.hpp"
#include "ck_tile/core/numeric/integer.hpp"
#include "ck_tile/core/numeric/integral_constant.hpp"
#include "ck_tile/core/numeric/math.hpp"
...
...
include/ck_tile/core/numeric/int8.hpp
0 → 100644
View file @
ead5167a
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include "ck_tile/core/config.hpp"
#include "ck_tile/core/numeric/half.hpp"
#include "ck_tile/core/numeric/integral_constant.hpp"
#include "ck_tile/core/numeric/math.hpp"
#include "ck_tile/core/numeric/numeric.hpp"
#include "ck_tile/core/utility/bit_cast.hpp"
#include "ck_tile/core/utility/random.hpp"
#include <stdint.h>
#include <type_traits>
#pragma once
namespace
ck_tile
{
// use int8_t directly for int8 arithmetic
// here one can use ck_tile::int8_t to access original int8_t
using int8_t = int8_t; // re-export the global ::int8_t into the ck_tile namespace
// limits traits, specialized per numeric type below
template <class T>
struct numeric;
// std::numeric_limits-style traits for int8_t inside ck_tile.
// Only min/lowest/max/zero carry real meaning for an integer type; the
// floating-point-only queries below return a dummy value of 1, matching the
// convention used by the other ck_tile numeric specializations.
template <>
struct numeric<int8_t>
{
    // smallest representable value (-2^7)
    CK_TILE_HOST_DEVICE static constexpr int8_t min() { return static_cast<int8_t>(-128); }

    // most negative finite value; identical to min() for integer types
    CK_TILE_HOST_DEVICE static constexpr int8_t lowest() { return static_cast<int8_t>(-128); }

    // largest representable value (2^7 - 1)
    CK_TILE_HOST_DEVICE static constexpr int8_t max() { return static_cast<int8_t>(127); }

    // spacing between adjacent values — dummy for integers, not used
    CK_TILE_HOST_DEVICE static constexpr int8_t epsilon() { return 1; }

    // dummy, not used
    CK_TILE_HOST_DEVICE static constexpr int8_t round_error() { return 1; }

    // dummy, not used (integers have no infinity)
    CK_TILE_HOST_DEVICE static constexpr int8_t infinity() { return 1; }

    // dummy, not used (integers have no NaN)
    CK_TILE_HOST_DEVICE static constexpr int8_t quiet_NaN() { return 1; }

    // dummy, not used (integers have no NaN)
    CK_TILE_HOST_DEVICE static constexpr int8_t signaling_NaN() { return 1; }

    // dummy, not used (integers have no subnormals)
    CK_TILE_HOST_DEVICE static constexpr int8_t denorm_min() { return 1; }

    // additive identity
    CK_TILE_HOST_DEVICE static constexpr int8_t zero() { return 0; }
};
// NOTE(review): disabled placeholder — the constants below are the fp16
// (half-precision) bit-layout traits (5-bit exponent, 10-bit mantissa,
// bias 15), not anything meaningful for int8. Rewrite before enabling.
#if 0
template <typename T>
struct numeric_traits;
template <>
struct numeric_traits<int8_t>
{
    static constexpr int exp = 5;
    static constexpr int mant = 10;
    static constexpr int bias = 15;
    static constexpr uint16_t nan_mask = 0x7C00;
    static constexpr uint16_t head_mask = 0xFC00;
    static constexpr uint16_t mant_mask = 0x3FF;
    static constexpr uint16_t exp_mask = 0x1F;
    static constexpr uint32_t Inf = 0x7C00;
    static constexpr uint32_t NegInf = 0xFC00;
    static constexpr uint32_t NaN = 0x7C01;
    static constexpr uint32_t Neg0 = 0x8000;
    using bitwise_type = uint16_t;
};
#endif
// Widen a signed 8-bit integer to float; every int8 value is exactly
// representable, so the conversion is lossless.
CK_TILE_HOST_DEVICE constexpr float int8_to_float(const int8_t& x)
{
    const float widened = static_cast<float>(x);
    return widened;
}
// Narrow a float to int8 via plain static_cast (truncation toward zero).
// NOTE(review): inputs outside [-128, 127] are undefined behavior under the
// standard float-to-integer conversion rules — presumably callers clamp or
// saturate beforehand; verify at the call sites.
CK_TILE_HOST_DEVICE constexpr int8_t float_to_int8(const float& x)
{
    const auto truncated = static_cast<int8_t>(x);
    return truncated;
}
}
// namespace ck_tile
Prev
1
2
3
4
5
6
7
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment