Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
2b27d5fc
Commit
2b27d5fc
authored
Jul 01, 2022
by
Chao Liu
Browse files
Merge remote-tracking branch 'origin/develop' into rosenrodt/gemm-layernorm
parents
f689a155
fa9a0a5c
Changes
137
Hide whitespace changes
Inline
Side-by-side
Showing
17 changed files
with
594 additions
and
634 deletions
+594
-634
profiler/src/profile_gemm.cpp
profiler/src/profile_gemm.cpp
+87
-317
profiler/src/profile_gemm_add_add_fastgelu.cpp
profiler/src/profile_gemm_add_add_fastgelu.cpp
+14
-16
profiler/src/profile_gemm_splitk.cpp
profiler/src/profile_gemm_splitk.cpp
+148
-0
profiler/src/profile_normalization.cpp
profiler/src/profile_normalization.cpp
+134
-0
profiler/src/profiler.cpp
profiler/src/profiler.cpp
+34
-26
test/batched_gemm/batched_gemm_util.hpp
test/batched_gemm/batched_gemm_util.hpp
+0
-109
test/gemm/gemm_util.hpp
test/gemm/gemm_util.hpp
+5
-116
test/gemm/gemm_xdl_bf16.cpp
test/gemm/gemm_xdl_bf16.cpp
+49
-28
test/gemm/gemm_xdl_fp16.cpp
test/gemm/gemm_xdl_fp16.cpp
+10
-0
test/gemm/gemm_xdl_fp32.cpp
test/gemm/gemm_xdl_fp32.cpp
+10
-0
test/gemm_split_k/CMakeLists.txt
test/gemm_split_k/CMakeLists.txt
+1
-1
test/gemm_split_k/gemm_split_k.cpp
test/gemm_split_k/gemm_split_k.cpp
+13
-10
test/softmax/CMakeLists.txt
test/softmax/CMakeLists.txt
+4
-1
test/softmax/test_softmax_fp16.cpp
test/softmax/test_softmax_fp16.cpp
+6
-1
test/softmax/test_softmax_fp32.cpp
test/softmax/test_softmax_fp32.cpp
+6
-1
test/softmax/test_softmax_int8.cpp
test/softmax/test_softmax_int8.cpp
+30
-0
test/softmax/test_softmax_util.hpp
test/softmax/test_softmax_util.hpp
+43
-8
No files found.
profiler/src/profile_gemm.cpp
View file @
2b27d5fc
...
...
@@ -14,10 +14,6 @@ enum struct GemmMatrixLayout
MK_NK_MN
,
// 1
KM_KN_MN
,
// 2
KM_NK_MN
,
// 3
MK_KN_NM
,
// 4
MK_NK_NM
,
// 5
KM_KN_NM
,
// 6
KM_NK_NM
,
// 7
};
enum
struct
GemmDataType
...
...
@@ -30,7 +26,7 @@ enum struct GemmDataType
int
profile_gemm
(
int
argc
,
char
*
argv
[])
{
if
(
!
(
argc
=
=
14
||
argc
==
15
)
)
if
(
argc
!
=
14
)
{
printf
(
"arg1: tensor operation (gemm: GEMM)
\n
"
);
printf
(
"arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)
\n
"
);
...
...
@@ -41,9 +37,8 @@ int profile_gemm(int argc, char* argv[])
printf
(
"arg4: verification (0: no; 1: yes)
\n
"
);
printf
(
"arg5: initialization (0: no init; 1: integer value; 2: decimal value)
\n
"
);
printf
(
"arg6: print tensor value (0: no; 1: yes)
\n
"
);
printf
(
"arg7: time kernel (0=n
0
, 1=yes)
\n
"
);
printf
(
"arg7: time kernel (0=n
o
, 1=yes)
\n
"
);
printf
(
"arg8 to 13: M, N, K, StrideA, StrideB, StrideC
\n
"
);
printf
(
"arg14: split k into mulitiple batch
\n
"
);
exit
(
1
);
}
...
...
@@ -61,350 +56,125 @@ int profile_gemm(int argc, char* argv[])
const
int
StrideA
=
std
::
stoi
(
argv
[
11
]);
const
int
StrideB
=
std
::
stoi
(
argv
[
12
]);
const
int
StrideC
=
std
::
stoi
(
argv
[
13
]);
int
KBatch
=
1
;
if
(
argc
==
15
)
KBatch
=
std
::
stoi
(
argv
[
14
]);
if
(
data_type
==
GemmDataType
::
F16_F16_F16
&&
layout
==
GemmMatrixLayout
::
MK_KN_MN
)
using
F32
=
float
;
using
F16
=
ck
::
half_t
;
using
BF16
=
ck
::
bhalf_t
;
using
INT8
=
int8_t
;
using
INT32
=
int32_t
;
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
auto
profile
=
[
&
](
auto
a_type
,
auto
b_type
,
auto
acc_type
,
auto
c_type
,
auto
a_layout
,
auto
b_layout
,
auto
c_layout
)
{
using
ADataType
=
decltype
(
a_type
);
using
BDataType
=
decltype
(
b_type
);
using
AccDataType
=
decltype
(
acc_type
);
using
CDataType
=
decltype
(
c_type
);
using
ALayout
=
decltype
(
a_layout
);
using
BLayout
=
decltype
(
b_layout
);
using
CLayout
=
decltype
(
c_layout
);
const
int
DefaultStrideA
=
ck
::
is_same_v
<
ALayout
,
Row
>
?
K
:
M
;
const
int
DefaultStrideB
=
ck
::
is_same_v
<
BLayout
,
Row
>
?
N
:
K
;
const
int
DefaultStrideC
=
ck
::
is_same_v
<
CLayout
,
Row
>
?
N
:
M
;
bool
pass
=
ck
::
profiler
::
profile_gemm_impl
<
ADataType
,
BDataType
,
AccDataType
,
CDataType
,
ALayout
,
BLayout
,
CLayout
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
M
,
N
,
K
,
(
StrideA
<
0
)
?
DefaultStrideA
:
StrideA
,
(
StrideB
<
0
)
?
DefaultStrideB
:
StrideB
,
(
StrideC
<
0
)
?
DefaultStrideC
:
StrideC
);
return
pass
?
0
:
1
;
};
if
(
data_type
==
GemmDataType
::
F32_F32_F32
&&
layout
==
GemmMatrixLayout
::
MK_KN_MN
)
{
ck
::
profiler
::
profile_gemm_impl
<
ck
::
half_t
,
ck
::
half_t
,
ck
::
half_t
,
float
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
M
,
N
,
K
,
(
StrideA
<
0
)
?
K
:
StrideA
,
(
StrideB
<
0
)
?
N
:
StrideB
,
(
StrideC
<
0
)
?
N
:
StrideC
,
KBatch
);
return
profile
(
F32
{},
F32
{},
F32
{},
F32
{},
Row
{},
Row
{},
Row
{});
}
else
if
(
data_type
==
GemmDataType
::
F
16_F16_F16
&&
layout
==
GemmMatrixLayout
::
MK_NK_MN
)
else
if
(
data_type
==
GemmDataType
::
F
32_F32_F32
&&
layout
==
GemmMatrixLayout
::
MK_NK_MN
)
{
ck
::
profiler
::
profile_gemm_impl
<
ck
::
half_t
,
ck
::
half_t
,
ck
::
half_t
,
float
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
M
,
N
,
K
,
(
StrideA
<
0
)
?
K
:
StrideA
,
(
StrideB
<
0
)
?
K
:
StrideB
,
(
StrideC
<
0
)
?
N
:
StrideC
,
KBatch
);
return
profile
(
F32
{},
F32
{},
F32
{},
F32
{},
Row
{},
Col
{},
Row
{});
}
else
if
(
data_type
==
GemmDataType
::
F
16_F16_F16
&&
layout
==
GemmMatrixLayout
::
KM_KN_MN
)
else
if
(
data_type
==
GemmDataType
::
F
32_F32_F32
&&
layout
==
GemmMatrixLayout
::
KM_KN_MN
)
{
ck
::
profiler
::
profile_gemm_impl
<
ck
::
half_t
,
ck
::
half_t
,
ck
::
half_t
,
float
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
M
,
N
,
K
,
(
StrideA
<
0
)
?
M
:
StrideA
,
(
StrideB
<
0
)
?
N
:
StrideB
,
(
StrideC
<
0
)
?
N
:
StrideC
,
KBatch
);
return
profile
(
F32
{},
F32
{},
F32
{},
F32
{},
Col
{},
Row
{},
Row
{});
}
else
if
(
data_type
==
GemmDataType
::
F
16_F16_F16
&&
layout
==
GemmMatrixLayout
::
KM_NK_MN
)
else
if
(
data_type
==
GemmDataType
::
F
32_F32_F32
&&
layout
==
GemmMatrixLayout
::
KM_NK_MN
)
{
ck
::
profiler
::
profile_gemm_impl
<
ck
::
half_t
,
ck
::
half_t
,
ck
::
half_t
,
float
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
M
,
N
,
K
,
(
StrideA
<
0
)
?
M
:
StrideA
,
(
StrideB
<
0
)
?
K
:
StrideB
,
(
StrideC
<
0
)
?
N
:
StrideC
,
KBatch
);
return
profile
(
F32
{},
F32
{},
F32
{},
F32
{},
Col
{},
Col
{},
Row
{});
}
else
if
(
data_type
==
GemmDataType
::
F
32_F32_F32
&&
layout
==
GemmMatrixLayout
::
MK_KN_MN
)
else
if
(
data_type
==
GemmDataType
::
F
16_F16_F16
&&
layout
==
GemmMatrixLayout
::
MK_KN_MN
)
{
ck
::
profiler
::
profile_gemm_impl
<
float
,
float
,
float
,
float
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
M
,
N
,
K
,
(
StrideA
<
0
)
?
K
:
StrideA
,
(
StrideB
<
0
)
?
N
:
StrideB
,
(
StrideC
<
0
)
?
N
:
StrideC
,
KBatch
);
return
profile
(
F16
{},
F16
{},
F32
{},
F16
{},
Row
{},
Row
{},
Row
{});
}
else
if
(
data_type
==
GemmDataType
::
F
32_F32_F32
&&
layout
==
GemmMatrixLayout
::
MK_NK_MN
)
else
if
(
data_type
==
GemmDataType
::
F
16_F16_F16
&&
layout
==
GemmMatrixLayout
::
MK_NK_MN
)
{
ck
::
profiler
::
profile_gemm_impl
<
float
,
float
,
float
,
float
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
M
,
N
,
K
,
(
StrideA
<
0
)
?
K
:
StrideA
,
(
StrideB
<
0
)
?
K
:
StrideB
,
(
StrideC
<
0
)
?
N
:
StrideC
,
KBatch
);
return
profile
(
F16
{},
F16
{},
F32
{},
F16
{},
Row
{},
Col
{},
Row
{});
}
else
if
(
data_type
==
GemmDataType
::
F
32_F32_F32
&&
layout
==
GemmMatrixLayout
::
KM_KN_MN
)
else
if
(
data_type
==
GemmDataType
::
F
16_F16_F16
&&
layout
==
GemmMatrixLayout
::
KM_KN_MN
)
{
ck
::
profiler
::
profile_gemm_impl
<
float
,
float
,
float
,
float
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
M
,
N
,
K
,
(
StrideA
<
0
)
?
M
:
StrideA
,
(
StrideB
<
0
)
?
N
:
StrideB
,
(
StrideC
<
0
)
?
N
:
StrideC
,
KBatch
);
return
profile
(
F16
{},
F16
{},
F32
{},
F16
{},
Col
{},
Row
{},
Row
{});
}
else
if
(
data_type
==
GemmDataType
::
F
32_F32_F32
&&
layout
==
GemmMatrixLayout
::
KM_NK_MN
)
else
if
(
data_type
==
GemmDataType
::
F
16_F16_F16
&&
layout
==
GemmMatrixLayout
::
KM_NK_MN
)
{
ck
::
profiler
::
profile_gemm_impl
<
float
,
float
,
float
,
float
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
M
,
N
,
K
,
(
StrideA
<
0
)
?
M
:
StrideA
,
(
StrideB
<
0
)
?
K
:
StrideB
,
(
StrideC
<
0
)
?
N
:
StrideC
,
KBatch
);
return
profile
(
F16
{},
F16
{},
F32
{},
F16
{},
Col
{},
Col
{},
Row
{});
}
else
if
(
data_type
==
GemmDataType
::
INT8_INT8_INT8
&&
layout
==
GemmMatrixLayout
::
MK_KN_MN
)
else
if
(
data_type
==
GemmDataType
::
BF16_BF16_BF16
&&
layout
==
GemmMatrixLayout
::
MK_KN_MN
)
{
ck
::
profiler
::
profile_gemm_impl
<
int8_t
,
int8_t
,
int8_t
,
int32_t
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
M
,
N
,
K
,
(
StrideA
<
0
)
?
K
:
StrideA
,
(
StrideB
<
0
)
?
N
:
StrideB
,
(
StrideC
<
0
)
?
N
:
StrideC
,
KBatch
);
return
profile
(
BF16
{},
BF16
{},
F32
{},
BF16
{},
Row
{},
Row
{},
Row
{});
}
else
if
(
data_type
==
GemmDataType
::
INT8_INT8_INT8
&&
layout
==
GemmMatrixLayout
::
MK_NK_MN
)
else
if
(
data_type
==
GemmDataType
::
BF16_BF16_BF16
&&
layout
==
GemmMatrixLayout
::
MK_NK_MN
)
{
ck
::
profiler
::
profile_gemm_impl
<
int8_t
,
int8_t
,
int8_t
,
int32_t
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
M
,
N
,
K
,
(
StrideA
<
0
)
?
M
:
StrideA
,
(
StrideB
<
0
)
?
K
:
StrideB
,
(
StrideC
<
0
)
?
N
:
StrideC
,
KBatch
);
return
profile
(
BF16
{},
BF16
{},
F32
{},
BF16
{},
Row
{},
Col
{},
Row
{});
}
else
if
(
data_type
==
GemmDataType
::
INT8_INT8_INT8
&&
layout
==
GemmMatrixLayout
::
KM_KN_MN
)
else
if
(
data_type
==
GemmDataType
::
BF16_BF16_BF16
&&
layout
==
GemmMatrixLayout
::
KM_KN_MN
)
{
ck
::
profiler
::
profile_gemm_impl
<
int8_t
,
int8_t
,
int8_t
,
int32_t
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
M
,
N
,
K
,
(
StrideA
<
0
)
?
M
:
StrideA
,
(
StrideB
<
0
)
?
N
:
StrideB
,
(
StrideC
<
0
)
?
N
:
StrideC
,
KBatch
);
return
profile
(
BF16
{},
BF16
{},
F32
{},
BF16
{},
Col
{},
Row
{},
Row
{});
}
else
if
(
data_type
==
GemmDataType
::
INT8_INT8_INT8
&&
layout
==
GemmMatrixLayout
::
KM_NK_MN
)
else
if
(
data_type
==
GemmDataType
::
BF16_BF16_BF16
&&
layout
==
GemmMatrixLayout
::
KM_NK_MN
)
{
ck
::
profiler
::
profile_gemm_impl
<
int8_t
,
int8_t
,
int8_t
,
int32_t
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
M
,
N
,
K
,
(
StrideA
<
0
)
?
M
:
StrideA
,
(
StrideB
<
0
)
?
K
:
StrideB
,
(
StrideC
<
0
)
?
N
:
StrideC
,
KBatch
);
return
profile
(
BF16
{},
BF16
{},
F32
{},
BF16
{},
Col
{},
Col
{},
Row
{});
}
else
if
(
data_type
==
GemmDataType
::
BF16_BF16_BF16
&&
layout
==
GemmMatrixLayout
::
MK_KN_MN
)
else
if
(
data_type
==
GemmDataType
::
INT8_INT8_INT8
&&
layout
==
GemmMatrixLayout
::
MK_KN_MN
)
{
ck
::
profiler
::
profile_gemm_impl
<
ck
::
bhalf_t
,
ck
::
bhalf_t
,
ck
::
bhalf_t
,
float
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
M
,
N
,
K
,
(
StrideA
<
0
)
?
K
:
StrideA
,
(
StrideB
<
0
)
?
N
:
StrideB
,
(
StrideC
<
0
)
?
N
:
StrideC
,
KBatch
);
return
profile
(
INT8
{},
INT8
{},
INT32
{},
INT8
{},
Row
{},
Row
{},
Row
{});
}
else
if
(
data_type
==
GemmDataType
::
BF16_BF16_BF16
&&
layout
==
GemmMatrixLayout
::
MK_NK_MN
)
else
if
(
data_type
==
GemmDataType
::
INT8_INT8_INT8
&&
layout
==
GemmMatrixLayout
::
MK_NK_MN
)
{
ck
::
profiler
::
profile_gemm_impl
<
ck
::
bhalf_t
,
ck
::
bhalf_t
,
ck
::
bhalf_t
,
float
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
M
,
N
,
K
,
(
StrideA
<
0
)
?
M
:
StrideA
,
(
StrideB
<
0
)
?
K
:
StrideB
,
(
StrideC
<
0
)
?
N
:
StrideC
,
KBatch
);
return
profile
(
INT8
{},
INT8
{},
INT32
{},
INT8
{},
Row
{},
Col
{},
Row
{});
}
else
if
(
data_type
==
GemmDataType
::
BF16_BF16_BF16
&&
layout
==
GemmMatrixLayout
::
KM_KN_MN
)
else
if
(
data_type
==
GemmDataType
::
INT8_INT8_INT8
&&
layout
==
GemmMatrixLayout
::
KM_KN_MN
)
{
ck
::
profiler
::
profile_gemm_impl
<
ck
::
bhalf_t
,
ck
::
bhalf_t
,
ck
::
bhalf_t
,
float
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
M
,
N
,
K
,
(
StrideA
<
0
)
?
M
:
StrideA
,
(
StrideB
<
0
)
?
N
:
StrideB
,
(
StrideC
<
0
)
?
N
:
StrideC
,
KBatch
);
return
profile
(
INT8
{},
INT8
{},
INT32
{},
INT8
{},
Col
{},
Row
{},
Row
{});
}
else
if
(
data_type
==
GemmDataType
::
BF16_BF16_BF16
&&
layout
==
GemmMatrixLayout
::
KM_NK_MN
)
else
if
(
data_type
==
GemmDataType
::
INT8_INT8_INT8
&&
layout
==
GemmMatrixLayout
::
KM_NK_MN
)
{
ck
::
profiler
::
profile_gemm_impl
<
ck
::
bhalf_t
,
ck
::
bhalf_t
,
ck
::
bhalf_t
,
float
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
M
,
N
,
K
,
(
StrideA
<
0
)
?
M
:
StrideA
,
(
StrideB
<
0
)
?
K
:
StrideB
,
(
StrideC
<
0
)
?
N
:
StrideC
,
KBatch
);
return
profile
(
INT8
{},
INT8
{},
INT32
{},
INT8
{},
Col
{},
Col
{},
Row
{});
}
else
{
throw
std
::
runtime_error
(
"wrong! this GEMM data_type & layout is not implemented"
);
}
std
::
cout
<<
"this data_type & layout is not implemented"
<<
std
::
endl
;
return
0
;
return
1
;
}
}
profiler/src/profile_gemm_add_add_fastgelu.cpp
View file @
2b27d5fc
...
...
@@ -16,10 +16,6 @@ int profile_gemm_add_add_fastgelu(int argc, char* argv[])
MK_NK_MN_MN_MN
,
// 1
KM_KN_MN_MN_MN
,
// 2
KM_NK_MN_MN_MN
,
// 3
MK_KN_NM_MN_MN
,
// 4
MK_NK_NM_MN_MN
,
// 5
KM_KN_NM_MN_MN
,
// 6
KM_NK_NM_MN_MN
,
// 7
};
enum
struct
MatrixDataType
...
...
@@ -101,17 +97,17 @@ int profile_gemm_add_add_fastgelu(int argc, char* argv[])
const
int
DefaultStrideD1
=
ck
::
is_same_v
<
D1Layout
,
Row
>
?
N
:
M
;
const
int
DefaultStrideE
=
ck
::
is_same_v
<
ELayout
,
Row
>
?
N
:
M
;
return
ck
::
profiler
::
profile_gemm_add_add_fastgelu_impl
<
ADataType
,
BDataType
,
AccDataType
,
D0DataType
,
D1DataType
,
EDataType
,
ALayout
,
BLayout
,
D0Layout
,
D1Layout
,
ELayout
>
(
bool
pass
=
ck
::
profiler
::
profile_gemm_add_add_fastgelu_impl
<
ADataType
,
BDataType
,
AccDataType
,
D0DataType
,
D1DataType
,
EDataType
,
ALayout
,
BLayout
,
D0Layout
,
D1Layout
,
ELayout
>
(
do_verification
,
init_method
,
do_log
,
...
...
@@ -124,6 +120,8 @@ int profile_gemm_add_add_fastgelu(int argc, char* argv[])
(
StrideD0
<
0
)
?
DefaultStrideD0
:
StrideD0
,
(
StrideD1
<
0
)
?
DefaultStrideD1
:
StrideD1
,
(
StrideE
<
0
)
?
DefaultStrideE
:
StrideE
);
return
pass
?
0
:
1
;
};
if
(
data_type
==
MatrixDataType
::
F16_F16_F16_F16_F16
&&
layout
==
MatrixLayout
::
MK_KN_MN_MN_MN
)
...
...
@@ -149,6 +147,6 @@ int profile_gemm_add_add_fastgelu(int argc, char* argv[])
{
std
::
cout
<<
"this data_type & layout is not implemented"
<<
std
::
endl
;
return
0
;
return
1
;
}
}
profiler/src/profile_gemm_splitk.cpp
0 → 100644
View file @
2b27d5fc
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "profiler/include/profile_gemm_splitk_impl.hpp"
enum
struct
GemmMatrixLayout
{
MK_KN_MN
,
// 0
MK_NK_MN
,
// 1
KM_KN_MN
,
// 2
KM_NK_MN
,
// 3
};
enum
struct
GemmDataType
{
F32_F32_F32
,
// 0
F16_F16_F16
,
// 1
BF16_BF16_BF16
,
// 2
INT8_INT8_INT8
,
// 3
};
int
profile_gemm_splitk
(
int
argc
,
char
*
argv
[])
{
if
(
argc
!=
15
)
{
printf
(
"arg1: tensor operation (gemm_splitk: Split-K GEMM)
\n
"
);
printf
(
"arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)
\n
"
);
printf
(
"arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];
\n
"
);
printf
(
" 1: A[m, k] * B[n, k] = C[m, n];
\n
"
);
printf
(
" 2: A[k, m] * B[k, n] = C[m, n];
\n
"
);
printf
(
" 3: A[k, m] * B[n, k] = C[m, n])
\n
"
);
printf
(
"arg4: verification (0: no; 1: yes)
\n
"
);
printf
(
"arg5: initialization (0: no init; 1: integer value; 2: decimal value)
\n
"
);
printf
(
"arg6: print tensor value (0: no; 1: yes)
\n
"
);
printf
(
"arg7: time kernel (0=no, 1=yes)
\n
"
);
printf
(
"arg8 to 13: M, N, K, StrideA, StrideB, StrideC
\n
"
);
printf
(
"arg14: split k into mulitiple batch
\n
"
);
exit
(
1
);
}
const
auto
data_type
=
static_cast
<
GemmDataType
>
(
std
::
stoi
(
argv
[
2
]));
const
auto
layout
=
static_cast
<
GemmMatrixLayout
>
(
std
::
stoi
(
argv
[
3
]));
const
bool
do_verification
=
std
::
stoi
(
argv
[
4
]);
const
int
init_method
=
std
::
stoi
(
argv
[
5
]);
const
bool
do_log
=
std
::
stoi
(
argv
[
6
]);
const
bool
time_kernel
=
std
::
stoi
(
argv
[
7
]);
const
int
M
=
std
::
stoi
(
argv
[
8
]);
const
int
N
=
std
::
stoi
(
argv
[
9
]);
const
int
K
=
std
::
stoi
(
argv
[
10
]);
const
int
StrideA
=
std
::
stoi
(
argv
[
11
]);
const
int
StrideB
=
std
::
stoi
(
argv
[
12
]);
const
int
StrideC
=
std
::
stoi
(
argv
[
13
]);
const
int
KBatch
=
std
::
stoi
(
argv
[
14
]);
using
F32
=
float
;
using
F16
=
ck
::
half_t
;
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
auto
profile
=
[
&
](
auto
a_type
,
auto
b_type
,
auto
acc_type
,
auto
c_type
,
auto
a_layout
,
auto
b_layout
,
auto
c_layout
)
{
using
ADataType
=
decltype
(
a_type
);
using
BDataType
=
decltype
(
b_type
);
using
AccDataType
=
decltype
(
acc_type
);
using
CDataType
=
decltype
(
c_type
);
using
ALayout
=
decltype
(
a_layout
);
using
BLayout
=
decltype
(
b_layout
);
using
CLayout
=
decltype
(
c_layout
);
const
int
DefaultStrideA
=
ck
::
is_same_v
<
ALayout
,
Row
>
?
K
:
M
;
const
int
DefaultStrideB
=
ck
::
is_same_v
<
BLayout
,
Row
>
?
N
:
K
;
const
int
DefaultStrideC
=
ck
::
is_same_v
<
CLayout
,
Row
>
?
N
:
M
;
bool
pass
=
ck
::
profiler
::
profile_gemm_splitk_impl
<
ADataType
,
BDataType
,
AccDataType
,
CDataType
,
ALayout
,
BLayout
,
CLayout
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
M
,
N
,
K
,
(
StrideA
<
0
)
?
DefaultStrideA
:
StrideA
,
(
StrideB
<
0
)
?
DefaultStrideB
:
StrideB
,
(
StrideC
<
0
)
?
DefaultStrideC
:
StrideC
,
KBatch
);
return
pass
?
0
:
1
;
};
if
(
data_type
==
GemmDataType
::
F32_F32_F32
&&
layout
==
GemmMatrixLayout
::
MK_KN_MN
)
{
return
profile
(
F32
{},
F32
{},
F32
{},
F32
{},
Row
{},
Row
{},
Row
{});
}
else
if
(
data_type
==
GemmDataType
::
F32_F32_F32
&&
layout
==
GemmMatrixLayout
::
MK_NK_MN
)
{
return
profile
(
F32
{},
F32
{},
F32
{},
F32
{},
Row
{},
Col
{},
Row
{});
}
else
if
(
data_type
==
GemmDataType
::
F32_F32_F32
&&
layout
==
GemmMatrixLayout
::
KM_KN_MN
)
{
return
profile
(
F32
{},
F32
{},
F32
{},
F32
{},
Col
{},
Row
{},
Row
{});
}
else
if
(
data_type
==
GemmDataType
::
F32_F32_F32
&&
layout
==
GemmMatrixLayout
::
KM_NK_MN
)
{
return
profile
(
F32
{},
F32
{},
F32
{},
F32
{},
Col
{},
Col
{},
Row
{});
}
else
if
(
data_type
==
GemmDataType
::
F16_F16_F16
&&
layout
==
GemmMatrixLayout
::
MK_KN_MN
)
{
return
profile
(
F16
{},
F16
{},
F32
{},
F16
{},
Row
{},
Row
{},
Row
{});
}
else
if
(
data_type
==
GemmDataType
::
F16_F16_F16
&&
layout
==
GemmMatrixLayout
::
MK_NK_MN
)
{
return
profile
(
F16
{},
F16
{},
F32
{},
F16
{},
Row
{},
Col
{},
Row
{});
}
else
if
(
data_type
==
GemmDataType
::
F16_F16_F16
&&
layout
==
GemmMatrixLayout
::
KM_KN_MN
)
{
return
profile
(
F16
{},
F16
{},
F32
{},
F16
{},
Col
{},
Row
{},
Row
{});
}
else
if
(
data_type
==
GemmDataType
::
F16_F16_F16
&&
layout
==
GemmMatrixLayout
::
KM_NK_MN
)
{
return
profile
(
F16
{},
F16
{},
F32
{},
F16
{},
Col
{},
Col
{},
Row
{});
}
else
{
std
::
cout
<<
"this data_type & layout is not implemented"
<<
std
::
endl
;
return
1
;
}
}
profiler/src/profile_normalization.cpp
0 → 100644
View file @
2b27d5fc
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <vector>
#include <unordered_map>
#include "profiler/include/profile_normalization_impl.hpp"
using
ck
::
index_t
;
using
ck
::
profiler
::
NormDataType
;
using
ck
::
profiler
::
NormType
;
struct
ArgParser
{
std
::
unordered_map
<
std
::
string
,
NormType
>
norm_dict
=
{{
"layernorm"
,
NormType
::
LAYERNORM
},
{
"batchnorm"
,
NormType
::
BATCHNORM
},
{
"softmax"
,
NormType
::
SOFTMAX
}};
std
::
unordered_map
<
std
::
string
,
std
::
vector
<
int
>>
long_opts
=
{
{
"length"
,
{}},
{
"stride"
,
{}},
{
"reduce"
,
{}},
{
"alpha"
,
{}},
{
"beta"
,
{}}};
bool
parse_opt
(
int
argc
,
char
*
argv
[],
const
std
::
string
&
key
,
int
i
)
{
if
(
std
::
string
(
"--"
)
+
key
==
argv
[
i
])
{
int
pos
=
i
;
while
(
++
i
<
argc
&&
argv
[
i
][
0
]
!=
'-'
)
{}
int
end
=
i
;
for
(
int
j
=
pos
+
1
;
j
<
end
;
j
++
)
{
long_opts
[
key
].
push_back
(
std
::
stoi
(
argv
[
j
]));
}
return
true
;
}
return
false
;
}
void
operator
()(
int
argc
,
char
*
argv
[])
{
for
(
auto
&
kv
:
long_opts
)
{
for
(
int
i
=
1
;
i
<
argc
;
i
++
)
{
if
(
parse_opt
(
argc
,
argv
,
kv
.
first
,
i
))
break
;
}
}
}
};
void
print_help
()
{
std
::
cout
<<
"arg1: tensor operation (layernorm/batchnorm/softmax)
\n
"
<<
"arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)
\n
"
<<
"arg3: verification (0: no; 1: yes)
\n
"
<<
"arg4: initialization (0: no init; 1: integer value; 2: decimal value)
\n
"
<<
"arg5: print tensor value (0: no; 1: yes)
\n
"
<<
"arg6: time kernel (0=n0, 1=yes)
\n
"
<<
"--length: tensor extents (e.g, --length 8 4 256)
\n
"
<<
"--stride: tensor strides (e.g, --stride 1024 256 1)
\n
"
<<
"--reduce: to-reduce dimensions (e.g, --reduce 2)
\n
"
<<
"--alpha: alpha scaling value
\n
"
<<
"--beta: beta scaling value
\n
"
<<
std
::
endl
;
}
int
profile_normalization
(
int
argc
,
char
*
argv
[])
{
if
(
argc
<=
2
)
{
print_help
();
return
0
;
}
ArgParser
arg_parser
;
// short unnamed options
const
NormType
norm_type
=
arg_parser
.
norm_dict
[
argv
[
1
]];
const
NormDataType
data_type
=
static_cast
<
NormDataType
>
(
std
::
stoi
(
argv
[
2
]));
const
bool
do_verification
=
std
::
stoi
(
argv
[
3
]);
const
int
init_method
=
std
::
stoi
(
argv
[
4
]);
const
bool
do_log
=
std
::
stoi
(
argv
[
5
]);
const
bool
time_kernel
=
std
::
stoi
(
argv
[
6
]);
// parse the long options
arg_parser
(
argc
,
argv
);
const
std
::
vector
<
index_t
>
length
=
arg_parser
.
long_opts
[
"length"
];
const
std
::
vector
<
index_t
>
stride
=
arg_parser
.
long_opts
[
"stride"
];
const
std
::
vector
<
index_t
>
reduce
=
arg_parser
.
long_opts
[
"reduce"
];
const
index_t
alpha
=
arg_parser
.
long_opts
[
"alpha"
].
empty
()
?
1
:
arg_parser
.
long_opts
[
"alpha"
][
0
];
const
index_t
beta
=
arg_parser
.
long_opts
[
"beta"
].
empty
()
?
0
:
arg_parser
.
long_opts
[
"beta"
][
0
];
if
(
data_type
==
NormDataType
::
F16_F16
)
{
ck
::
profiler
::
profile_normalization_impl
<
ck
::
half_t
,
float
,
ck
::
half_t
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
length
,
stride
,
reduce
,
float
(
alpha
),
float
(
beta
),
norm_type
);
}
else
if
(
data_type
==
NormDataType
::
F32_F32
)
{
ck
::
profiler
::
profile_normalization_impl
<
float
,
float
,
float
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
length
,
stride
,
reduce
,
float
(
alpha
),
float
(
beta
),
norm_type
);
}
else
{
throw
std
::
runtime_error
(
"not implemented yet"
);
}
return
0
;
}
// hijack main() for quick debugging
// int main(int argc, char* argv[])
// {
// profile_normalization(argc, argv);
// return 0;
// }
profiler/src/profiler.cpp
View file @
2b27d5fc
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <cstring>
#include "profiler/include/profile_convnd_fwd.hpp"
int
profile_gemm
(
int
,
char
*
[]);
int
profile_gemm_splitk
(
int
,
char
*
[]);
int
profile_gemm_bias_2d
(
int
,
char
*
[]);
int
profile_gemm_bias_relu
(
int
,
char
*
[]);
int
profile_gemm_bias_relu_add
(
int
,
char
*
[]);
int
profile_gemm_reduce
(
int
,
char
*
[]);
int
profile_gemm_bias_add_reduce
(
int
,
char
*
[]);
int
profile_gemm_add_add_fastgelu
(
int
,
char
*
[]);
int
profile_gemm_reduce
(
int
,
char
*
[]);
int
profile_batched_gemm
(
int
,
char
*
[]);
int
profile_batched_gemm_reduce
(
int
,
char
*
[]);
int
profile_grouped_gemm
(
int
,
char
*
[]);
int
profile_conv_fwd
(
int
,
char
*
[]);
int
profile_conv_fwd_bias_relu
(
int
,
char
*
[]);
int
profile_conv_fwd_bias_relu_add
(
int
,
char
*
[]);
int
profile_convnd_fwd
(
int
argc
,
char
*
argv
[]);
int
profile_convnd_bwd_data
(
int
,
char
*
[],
int
);
int
profile_reduce
(
int
,
char
*
[]);
int
profile_conv_bwd_weight
(
int
,
char
*
[]);
int
profile_
batched_gemm_reduce
(
int
,
char
*
[]);
int
profile_
gemm_add_add_fastgelu
(
int
,
char
*
[]);
int
profile_
normalization
(
int
,
char
*
[]);
int
profile_
reduce
(
int
,
char
*
[]);
static
void
print_helper_message
()
{
// clang-format off
printf
(
"arg1: tensor operation (gemm: GEMM
\n
"
" gemm_bias_2d: GEMM+Bias(2D)
\n
"
" gemm_bias_relu: GEMM+Bias+ReLU
\n
"
" gemm_bias_relu_add: GEMM+Bias+ReLU+Add
\n
"
" gemm_reduce: GEMM+Reduce
\n
"
" grouped_gemm: Grouped GEMM
\n
"
" conv_fwd: ForwardConvolution
\n
"
" conv_fwd_bias_relu: ForwardConvolution+Bias+ReLU
\n
"
" conv_fwd_bias_relu_add: ForwardConvolution+Bias+ReLU+Add
\n
"
" conv1d_bwd_data: BackwardConvolution data 1 dim
\n
"
" conv2d_bwd_data: BackwardConvolution data 2 dim
\n
"
" conv3d_bwd_data: BackwardConvolution data 3 dim
\n
"
" reduce: Reduce
\n
"
" conv2d_bwd_weight: Backward Weight Convolution 2d
\n
"
" gemm_add_add_fastgelu: GEMM+Add+Add+FastGeLU
\n
"
);
printf
(
"arg1: tensor operation (gemm: GEMM
\n
"
" gemm_splitk: Split-K GEMM
\n
"
" gemm_bias_2d: GEMM+Bias(2D)
\n
"
" gemm_bias_relu: GEMM+Bias+ReLU
\n
"
" gemm_bias_relu_add: GEMM+Bias+ReLU+Add
\n
"
" gemm_add_add_fastgelu: GEMM+Add+Add+FastGeLU
\n
"
" gemm_reduce: GEMM+Reduce
\n
"
" batched_gemm: Batched GEMM
\n
"
" grouped_gemm: Grouped GEMM
\n
"
" conv_fwd: ForwardConvolution
\n
"
" conv_fwd_bias_relu: ForwardConvolution+Bias+ReLU
\n
"
" conv_fwd_bias_relu_add: ForwardConvolution+Bias+ReLU+Add
\n
"
" conv1d_bwd_data: BackwardConvolution data 1 dim
\n
"
" conv2d_bwd_data: BackwardConvolution data 2 dim
\n
"
" conv3d_bwd_data: BackwardConvolution data 3 dim
\n
"
" conv2d_bwd_weight: Backward Weight Convolution 2d
\n
"
" reduce: Reduce
\n
"
);
// clang-format on
}
...
...
@@ -60,6 +59,10 @@ int main(int argc, char* argv[])
{
return
profile_gemm
(
argc
,
argv
);
}
else
if
(
strcmp
(
argv
[
1
],
"gemm_splitk"
)
==
0
)
{
return
profile_gemm_splitk
(
argc
,
argv
);
}
else
if
(
strcmp
(
argv
[
1
],
"gemm_bias_2d"
)
==
0
)
{
return
profile_gemm_bias_2d
(
argc
,
argv
);
...
...
@@ -94,7 +97,7 @@ int main(int argc, char* argv[])
}
else
if
(
strcmp
(
argv
[
1
],
"conv_fwd"
)
==
0
)
{
return
ck
::
profiler
::
profile_convnd_fwd
(
argc
,
argv
);
return
profile_convnd_fwd
(
argc
,
argv
);
}
else
if
(
strcmp
(
argv
[
1
],
"conv_fwd_bias_relu"
)
==
0
)
{
...
...
@@ -128,6 +131,11 @@ int main(int argc, char* argv[])
{
return
profile_gemm_add_add_fastgelu
(
argc
,
argv
);
}
else
if
(
strcmp
(
argv
[
1
],
"batchnorm"
)
==
0
||
strcmp
(
argv
[
1
],
"layernorm"
)
==
0
||
strcmp
(
argv
[
1
],
"softmax"
)
==
0
)
{
return
profile_normalization
(
argc
,
argv
);
}
else
{
print_helper_message
();
...
...
test/batched_gemm/batched_gemm_util.hpp
deleted
100644 → 0
View file @
f689a155
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#ifndef BATCHED_GEMM_UTILS_HPP
#define BATCHED_GEMM_UTILS_HPP
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
namespace
ck
{
namespace
batched_gemm_util
{
struct
GemmParams
{
GemmParams
()
:
M
(
1024
),
N
(
1024
),
K
(
1024
),
StrideA
(
1024
),
StrideB
(
1024
),
StrideC
(
1024
),
alpha
(
1
),
beta
(
0
)
{
}
ck
::
index_t
M
;
ck
::
index_t
N
;
ck
::
index_t
K
;
ck
::
index_t
StrideA
;
ck
::
index_t
StrideB
;
ck
::
index_t
StrideC
;
float
alpha
;
float
beta
;
};
template
<
typename
BatchedGemmInstance
,
typename
ADataType
,
typename
BDataType
,
typename
CDataType
,
typename
AElementwiseOperation
,
typename
BElementwiseOperation
,
typename
CElementwiseOperation
>
void
RunHostBatchedGemm
(
const
Tensor
<
ADataType
>&
A
,
const
Tensor
<
BDataType
>&
B
,
Tensor
<
CDataType
>&
C
,
AElementwiseOperation
a_element_op
,
BElementwiseOperation
b_element_op
,
CElementwiseOperation
c_element_op
)
{
auto
ref_batched_gemm
=
BatchedGemmInstance
{};
auto
ref_invoker
=
ref_batched_gemm
.
MakeInvoker
();
auto
ref_argument
=
ref_batched_gemm
.
MakeArgument
(
A
,
B
,
C
,
a_element_op
,
b_element_op
,
c_element_op
);
ref_invoker
.
Run
(
ref_argument
);
}
template
<
typename
DeviceGemmPtr
,
typename
ADataType
,
typename
BDataType
,
typename
CDataType
,
typename
AElementwiseOperation
,
typename
BElementwiseOperation
,
typename
CElementwiseOperation
>
void
RunDeviceBatchedGemm
(
DeviceGemmPtr
&
batched_gemm_ptr
,
const
ck
::
batched_gemm_util
::
GemmParams
&
params
,
const
Tensor
<
ADataType
>&
A
,
const
Tensor
<
BDataType
>&
B
,
Tensor
<
CDataType
>&
C
,
AElementwiseOperation
a_element_op
,
BElementwiseOperation
b_element_op
,
CElementwiseOperation
c_element_op
)
{
DeviceMem
a_g_m_k_device_buf
(
sizeof
(
ADataType
)
*
A
.
mDesc
.
GetElementSpace
());
DeviceMem
b_g_k_n_device_buf
(
sizeof
(
BDataType
)
*
B
.
mDesc
.
GetElementSpace
());
DeviceMem
c_g_m_n_device_buf
(
sizeof
(
CDataType
)
*
C
.
mDesc
.
GetElementSpace
());
a_g_m_k_device_buf
.
ToDevice
(
A
.
mData
.
data
());
b_g_k_n_device_buf
.
ToDevice
(
B
.
mData
.
data
());
const
auto
batch_count
=
A
.
mDesc
.
GetLengths
()[
0
];
auto
invoker_ptr
=
batched_gemm_ptr
->
MakeInvokerPointer
();
auto
argument_ptr
=
batched_gemm_ptr
->
MakeArgumentPointer
(
static_cast
<
ADataType
*>
(
a_g_m_k_device_buf
.
GetDeviceBuffer
()),
static_cast
<
BDataType
*>
(
b_g_k_n_device_buf
.
GetDeviceBuffer
()),
static_cast
<
CDataType
*>
(
c_g_m_n_device_buf
.
GetDeviceBuffer
()),
params
.
M
,
params
.
N
,
params
.
K
,
params
.
StrideA
,
params
.
StrideB
,
params
.
StrideC
,
a_element_op
,
b_element_op
,
c_element_op
,
batch_count
);
if
(
!
batched_gemm_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
throw
std
::
runtime_error
(
"wrong! device_gemm with the specified compilation parameters does "
"not support this GEMM problem"
);
}
invoker_ptr
->
Run
(
argument_ptr
.
get
());
c_g_m_n_device_buf
.
FromDevice
(
C
.
mData
.
data
());
}
}
// namespace batched_gemm_util
}
// namespace ck
#endif
test/gemm/gemm_util.hpp
View file @
2b27d5fc
...
...
@@ -214,6 +214,11 @@ struct TestGemm
res
=
ck
::
utils
::
check_err
(
c_device
.
mData
,
c_host
.
mData
);
std
::
cout
<<
(
res
?
"SUCCESS"
:
"FAILURE"
)
<<
std
::
endl
;
}
else
if
(
std
::
is_same
<
CDataType
,
ck
::
bhalf_t
>::
value
)
{
res
=
ck
::
utils
::
check_err
(
c_device
.
mData
,
c_host
.
mData
);
std
::
cout
<<
(
res
?
"SUCCESS"
:
"FAILURE"
)
<<
std
::
endl
;
}
else
if
(
std
::
is_same
<
CDataType
,
int8_t
>::
value
)
{
res
=
ck
::
utils
::
check_err
(
c_device
.
mData
,
c_host
.
mData
);
...
...
@@ -234,121 +239,5 @@ struct TestGemm
}
};
template
<
typename
DeviceGemmPtr_
,
typename
ALayout
,
typename
BLayout
,
typename
CLayout
,
typename
AElementwiseOperation
,
typename
BElementwiseOperation
,
typename
CElementwiseOperation
>
struct
TestGemmBF16
{
using
BF16
=
ck
::
bhalf_t
;
auto
PrepareGemmTensorBF16
(
const
ck
::
gemm_util
::
GemmParams
&
params
)
{
auto
f_host_tensor_descriptor
=
[](
std
::
size_t
row
,
std
::
size_t
col
,
std
::
size_t
stride
,
auto
layout
)
{
if
(
std
::
is_same
<
decltype
(
layout
),
ck
::
tensor_layout
::
gemm
::
RowMajor
>::
value
)
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
row
,
col
}),
std
::
vector
<
std
::
size_t
>
({
stride
,
1
}));
}
else
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
row
,
col
}),
std
::
vector
<
std
::
size_t
>
({
1
,
stride
}));
}
};
// use fp32 host kernel to verify bf16 device kernel
Tensor
<
BF16
>
a_m_k_bf16
(
f_host_tensor_descriptor
(
params
.
M
,
params
.
K
,
params
.
StrideA
,
ALayout
{}));
Tensor
<
BF16
>
b_k_n_bf16
(
f_host_tensor_descriptor
(
params
.
K
,
params
.
N
,
params
.
StrideB
,
BLayout
{}));
Tensor
<
BF16
>
c_m_n_device_bf16
(
f_host_tensor_descriptor
(
params
.
M
,
params
.
N
,
params
.
StrideC
,
CLayout
{}));
Tensor
<
float
>
a_m_k_fp32
(
f_host_tensor_descriptor
(
params
.
M
,
params
.
K
,
params
.
StrideA
,
ALayout
{}));
Tensor
<
float
>
b_k_n_fp32
(
f_host_tensor_descriptor
(
params
.
K
,
params
.
N
,
params
.
StrideB
,
BLayout
{}));
Tensor
<
float
>
c_m_n_host_fp32
(
f_host_tensor_descriptor
(
params
.
M
,
params
.
N
,
params
.
StrideC
,
CLayout
{}));
Tensor
<
float
>
c_m_n_device_fp32
(
f_host_tensor_descriptor
(
params
.
M
,
params
.
N
,
params
.
StrideC
,
CLayout
{}));
a_m_k_bf16
.
GenerateTensorValue
(
GeneratorTensor_3
<
BF16
>
{
-
0.5
,
0.5
});
b_k_n_bf16
.
GenerateTensorValue
(
GeneratorTensor_3
<
BF16
>
{
-
0.5
,
0.5
});
bf16_to_f32_
(
a_m_k_bf16
,
a_m_k_fp32
);
bf16_to_f32_
(
b_k_n_bf16
,
b_k_n_fp32
);
return
std
::
make_tuple
(
a_m_k_bf16
,
b_k_n_bf16
,
c_m_n_device_bf16
,
a_m_k_fp32
,
b_k_n_fp32
,
c_m_n_host_fp32
,
c_m_n_device_fp32
);
}
auto
operator
()(
DeviceGemmPtr_
&
gemmPtr
)
{
// Arrange
ck
::
gemm_util
::
GemmParams
params
;
params
.
M
=
1024
;
params
.
N
=
1024
;
params
.
K
=
1024
;
params
.
StrideA
=
1024
;
params
.
StrideB
=
1024
;
params
.
StrideC
=
1024
;
auto
host_tensors
=
PrepareGemmTensorBF16
(
params
);
const
Tensor
<
BF16
>&
a_bf16
=
std
::
get
<
0
>
(
host_tensors
);
const
Tensor
<
BF16
>&
b_bf16
=
std
::
get
<
1
>
(
host_tensors
);
Tensor
<
BF16
>&
c_device_bf16
=
std
::
get
<
2
>
(
host_tensors
);
Tensor
<
float
>&
a_fp32
=
std
::
get
<
3
>
(
host_tensors
);
Tensor
<
float
>&
b_fp32
=
std
::
get
<
4
>
(
host_tensors
);
Tensor
<
float
>&
c_host_fp32
=
std
::
get
<
5
>
(
host_tensors
);
Tensor
<
float
>&
c_device_fp32
=
std
::
get
<
6
>
(
host_tensors
);
auto
a_element_op
=
AElementwiseOperation
{};
auto
b_element_op
=
BElementwiseOperation
{};
auto
c_element_op
=
CElementwiseOperation
{};
// use fp32 host kernel to verify bf16 device kernel
using
ReferenceGemmInstance
=
ck
::
tensor_operation
::
host
::
ReferenceGemm
<
float
,
float
,
float
,
float
,
AElementwiseOperation
,
BElementwiseOperation
,
CElementwiseOperation
>
;
ck
::
gemm_util
::
RunHostGEMM
<
ReferenceGemmInstance
>
(
a_fp32
,
b_fp32
,
c_host_fp32
,
a_element_op
,
b_element_op
,
c_element_op
);
// Act
ck
::
gemm_util
::
RunDeviceGEMM
(
gemmPtr
,
params
,
a_bf16
,
b_bf16
,
c_device_bf16
,
a_element_op
,
b_element_op
,
c_element_op
);
bf16_to_f32_
(
c_device_bf16
,
c_device_fp32
);
// Assert
bool
res
=
ck
::
utils
::
check_err
(
c_device_fp32
.
mData
,
c_host_fp32
.
mData
,
"Error: incorrect results!"
,
1e-2
f
,
1e-3
f
);
std
::
cout
<<
(
res
?
"SUCCESS"
:
"FAILURE"
)
<<
std
::
endl
;
return
res
;
};
};
}
// namespace gemm_util
}
// namespace ck
test/gemm/gemm_xdl_bf16.cpp
View file @
2b27d5fc
...
...
@@ -47,6 +47,11 @@ void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instances(
int
main
()
{
using
ADataType
=
ck
::
bhalf_t
;
using
BDataType
=
ck
::
bhalf_t
;
using
CDataType
=
ck
::
bhalf_t
;
using
AccDataType
=
float
;
using
RowMajor
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
ColumnMajor
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
...
...
@@ -58,13 +63,17 @@ int main()
for
(
auto
&
gemmPtr
:
gemmPtrs
)
{
res
&=
ck
::
gemm_util
::
TestGemmBF16
<
DeviceGemmNoOpPtr
,
ColumnMajor
,
RowMajor
,
RowMajor
,
PassThrough
,
PassThrough
,
PassThrough
>
{}(
gemmPtr
);
res
&=
ck
::
gemm_util
::
TestGemm
<
DeviceGemmNoOpPtr
,
ADataType
,
BDataType
,
CDataType
,
AccDataType
,
ColumnMajor
,
RowMajor
,
RowMajor
,
PassThrough
,
PassThrough
,
PassThrough
>
{}(
gemmPtr
);
}
gemmPtrs
.
clear
();
...
...
@@ -73,13 +82,17 @@ int main()
for
(
auto
&
gemmPtr
:
gemmPtrs
)
{
res
&=
ck
::
gemm_util
::
TestGemmBF16
<
DeviceGemmNoOpPtr
,
ColumnMajor
,
ColumnMajor
,
RowMajor
,
PassThrough
,
PassThrough
,
PassThrough
>
{}(
gemmPtr
);
res
&=
ck
::
gemm_util
::
TestGemm
<
DeviceGemmNoOpPtr
,
ADataType
,
BDataType
,
CDataType
,
AccDataType
,
ColumnMajor
,
ColumnMajor
,
RowMajor
,
PassThrough
,
PassThrough
,
PassThrough
>
{}(
gemmPtr
);
}
gemmPtrs
.
clear
();
...
...
@@ -88,13 +101,17 @@ int main()
for
(
auto
&
gemmPtr
:
gemmPtrs
)
{
res
&=
ck
::
gemm_util
::
TestGemmBF16
<
DeviceGemmNoOpPtr
,
RowMajor
,
RowMajor
,
RowMajor
,
PassThrough
,
PassThrough
,
PassThrough
>
{}(
gemmPtr
);
res
&=
ck
::
gemm_util
::
TestGemm
<
DeviceGemmNoOpPtr
,
ADataType
,
BDataType
,
CDataType
,
AccDataType
,
RowMajor
,
RowMajor
,
RowMajor
,
PassThrough
,
PassThrough
,
PassThrough
>
{}(
gemmPtr
);
}
gemmPtrs
.
clear
();
...
...
@@ -103,13 +120,17 @@ int main()
for
(
auto
&
gemmPtr
:
gemmPtrs
)
{
res
&=
ck
::
gemm_util
::
TestGemmBF16
<
DeviceGemmNoOpPtr
,
RowMajor
,
ColumnMajor
,
RowMajor
,
PassThrough
,
PassThrough
,
PassThrough
>
{}(
gemmPtr
);
res
&=
ck
::
gemm_util
::
TestGemm
<
DeviceGemmNoOpPtr
,
ADataType
,
BDataType
,
CDataType
,
AccDataType
,
RowMajor
,
ColumnMajor
,
RowMajor
,
PassThrough
,
PassThrough
,
PassThrough
>
{}(
gemmPtr
);
}
std
::
cout
<<
"TestGemm ..... "
<<
(
res
?
"SUCCESS"
:
"FAILURE"
)
<<
std
::
endl
;
...
...
test/gemm/gemm_xdl_fp16.cpp
View file @
2b27d5fc
...
...
@@ -38,10 +38,12 @@ void add_device_gemm_xdl_f16_f16_f16_km_nk_mn_instances(std::vector<DeviceGemmNo
void
add_device_gemm_xdl_f16_f16_f16_mk_nk_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_xdl_f16_f16_f16_mk_kn_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
#if 0
void add_device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
#endif
void
add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
...
...
@@ -69,8 +71,10 @@ int main()
std
::
vector
<
DeviceGemmNoOpPtr
>
gemmPtrs
;
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_f16_f16_f16_km_kn_mn_instances
(
gemmPtrs
);
#if 0
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instances(gemmPtrs);
#endif
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances
(
gemmPtrs
);
...
...
@@ -92,8 +96,10 @@ int main()
gemmPtrs
.
clear
();
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_f16_f16_f16_km_nk_mn_instances
(
gemmPtrs
);
#if 0
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instances(gemmPtrs);
#endif
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances
(
gemmPtrs
);
...
...
@@ -115,8 +121,10 @@ int main()
gemmPtrs
.
clear
();
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_f16_f16_f16_mk_kn_mn_instances
(
gemmPtrs
);
#if 0
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances(gemmPtrs);
#endif
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances
(
gemmPtrs
);
...
...
@@ -138,8 +146,10 @@ int main()
gemmPtrs
.
clear
();
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_f16_f16_f16_mk_nk_mn_instances
(
gemmPtrs
);
#if 0
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances(gemmPtrs);
#endif
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances
(
gemmPtrs
);
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
...
...
test/gemm/gemm_xdl_fp32.cpp
View file @
2b27d5fc
...
...
@@ -38,10 +38,12 @@ void add_device_gemm_xdl_f32_f32_f32_km_nk_mn_instances(std::vector<DeviceGemmNo
void
add_device_gemm_xdl_f32_f32_f32_mk_nk_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_xdl_f32_f32_f32_mk_kn_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
#if 0
void add_device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
#endif
void
add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
...
...
@@ -67,8 +69,10 @@ int main()
std
::
vector
<
DeviceGemmNoOpPtr
>
gemmPtrs
;
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_f32_f32_f32_km_kn_mn_instances
(
gemmPtrs
);
#if 0
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances(gemmPtrs);
#endif
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instances
(
gemmPtrs
);
...
...
@@ -90,8 +94,10 @@ int main()
gemmPtrs
.
clear
();
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_f32_f32_f32_km_nk_mn_instances
(
gemmPtrs
);
#if 0
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instances(gemmPtrs);
#endif
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instances
(
gemmPtrs
);
...
...
@@ -113,8 +119,10 @@ int main()
gemmPtrs
.
clear
();
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_f32_f32_f32_mk_kn_mn_instances
(
gemmPtrs
);
#if 0
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instances(gemmPtrs);
#endif
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instances
(
gemmPtrs
);
...
...
@@ -136,8 +144,10 @@ int main()
gemmPtrs
.
clear
();
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_f32_f32_f32_mk_nk_mn_instances
(
gemmPtrs
);
#if 0
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances(gemmPtrs);
#endif
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instances
(
gemmPtrs
);
...
...
test/gemm_split_k/CMakeLists.txt
View file @
2b27d5fc
add_test_executable
(
test_gemm_split_k gemm_split_k.cpp
)
target_link_libraries
(
test_gemm_split_k PRIVATE host_tensor
)
target_link_libraries
(
test_gemm_split_k PRIVATE device_gemm_instance
)
target_link_libraries
(
test_gemm_split_k PRIVATE device_gemm_
splitk_
instance
)
test/gemm_split_k/gemm_split_k.cpp
View file @
2b27d5fc
...
...
@@ -15,7 +15,6 @@
#include "ck/library/host_tensor/device_memory.hpp"
#include "ck/library/host_tensor/host_tensor.hpp"
#include "ck/library/host_tensor/host_tensor_generator.hpp"
#include "ck/library/host_tensor/device_memory.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/host_tensor/host_gemm.hpp"
...
...
@@ -28,20 +27,24 @@ enum struct GemmMatrixLayout
KM_NK_MN
,
// 3
};
using
DeviceGemmNoOpPtr
=
ck
::
tensor_operation
::
device
::
DeviceGemmPtr
<
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
>
;
using
DeviceGemm
SplitK
NoOpPtr
=
ck
::
tensor_operation
::
device
::
DeviceGemmSplitKPtr
<
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
>
;
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_gemm_instance
{
void
add_device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instances
(
std
::
vector
<
DeviceGemmSplitKNoOpPtr
>&
);
void
add_device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances
(
std
::
vector
<
DeviceGemmSplitKNoOpPtr
>&
);
void
add_device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances
(
std
::
vector
<
DeviceGemmSplitKNoOpPtr
>&
);
void
add_device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instances
(
std
::
vector
<
DeviceGemmSplitKNoOpPtr
>&
);
}
// namespace device_gemm_instance
}
// namespace device
...
...
@@ -150,7 +153,7 @@ int test_gemm(const gemmArgs& args)
c_device_buf
.
ToDevice
(
c_m_n_device_result
.
mData
.
data
());
// add device GEMM instances
std
::
vector
<
DeviceGemmNoOpPtr
>
gemm_ptrs
;
std
::
vector
<
DeviceGemm
SplitK
NoOpPtr
>
gemm_ptrs
;
if
(
args
.
layout
==
GemmMatrixLayout
::
MK_KN_MN
)
{
...
...
test/softmax/CMakeLists.txt
View file @
2b27d5fc
...
...
@@ -2,7 +2,10 @@ add_custom_target(test_softmax)
add_gtest_executable
(
test_softmax_fp32 test_softmax_fp32.cpp
)
add_gtest_executable
(
test_softmax_fp16 test_softmax_fp16.cpp
)
add_gtest_executable
(
test_softmax_int8 test_softmax_int8.cpp
)
target_link_libraries
(
test_softmax_fp32 PRIVATE host_tensor
)
target_link_libraries
(
test_softmax_fp16 PRIVATE host_tensor
)
target_link_libraries
(
test_softmax_int8 PRIVATE host_tensor
)
add_dependencies
(
test_softmax test_softmax_fp32
)
add_dependencies
(
test_softmax test_softmax_fp16
)
\ No newline at end of file
add_dependencies
(
test_softmax test_softmax_fp16
)
add_dependencies
(
test_softmax test_softmax_int8
)
\ No newline at end of file
test/softmax/test_softmax_fp16.cpp
View file @
2b27d5fc
...
...
@@ -15,14 +15,19 @@ class TestSoftmaxFP16 : public ck::TestSoftmax<Tuple>
// clang-format off
using
KernelTypes
=
::
testing
::
Types
<
// InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
std
::
tuple
<
ck
::
half_t
,
float
,
float
,
I
<
3
>
,
I
<
1
>
,
I
<
256
>
,
I
<
8
>
,
I
<
32
>
,
I
<
1
>
,
I
<
8
>
,
I
<
1
>
,
I
<
8
>
,
I
<
4
>>
,
// mixed precision
std
::
tuple
<
ck
::
half_t
,
float
,
ck
::
half_t
,
I
<
3
>
,
I
<
1
>
,
I
<
256
>
,
I
<
8
>
,
I
<
32
>
,
I
<
1
>
,
I
<
8
>
,
I
<
1
>
,
I
<
8
>
,
I
<
8
>>
,
std
::
tuple
<
ck
::
half_t
,
float
,
ck
::
half_t
,
I
<
3
>
,
I
<
1
>
,
I
<
256
>
,
I
<
4
>
,
I
<
64
>
,
I
<
1
>
,
I
<
8
>
,
I
<
1
>
,
I
<
8
>
,
I
<
8
>>
,
std
::
tuple
<
ck
::
half_t
,
float
,
ck
::
half_t
,
I
<
3
>
,
I
<
1
>
,
I
<
256
>
,
I
<
2
>
,
I
<
128
>
,
I
<
1
>
,
I
<
8
>
,
I
<
1
>
,
I
<
8
>
,
I
<
8
>>
,
std
::
tuple
<
ck
::
half_t
,
float
,
ck
::
half_t
,
I
<
3
>
,
I
<
1
>
,
I
<
256
>
,
I
<
1
>
,
I
<
256
>
,
I
<
1
>
,
I
<
8
>
,
I
<
1
>
,
I
<
8
>
,
I
<
8
>>
,
std
::
tuple
<
ck
::
half_t
,
float
,
ck
::
half_t
,
I
<
3
>
,
I
<
1
>
,
I
<
256
>
,
I
<
1
>
,
I
<
256
>
,
I
<
1
>
,
I
<
16
>
,
I
<
1
>
,
I
<
8
>
,
I
<
8
>>
,
std
::
tuple
<
ck
::
half_t
,
float
,
ck
::
half_t
,
I
<
3
>
,
I
<
1
>
,
I
<
256
>
,
I
<
1
>
,
I
<
256
>
,
I
<
1
>
,
I
<
32
>
,
I
<
1
>
,
I
<
8
>
,
I
<
8
>>
,
std
::
tuple
<
ck
::
half_t
,
float
,
ck
::
half_t
,
I
<
3
>
,
I
<
2
>
,
I
<
256
>
,
I
<
8
>
,
I
<
32
>
,
I
<
1
>
,
I
<
8
>
,
I
<
1
>
,
I
<
8
>
,
I
<
8
>>
,
std
::
tuple
<
ck
::
half_t
,
float
,
ck
::
half_t
,
I
<
3
>
,
I
<
2
>
,
I
<
256
>
,
I
<
4
>
,
I
<
64
>
,
I
<
1
>
,
I
<
8
>
,
I
<
1
>
,
I
<
8
>
,
I
<
8
>>
,
std
::
tuple
<
ck
::
half_t
,
float
,
ck
::
half_t
,
I
<
3
>
,
I
<
2
>
,
I
<
256
>
,
I
<
2
>
,
I
<
128
>
,
I
<
1
>
,
I
<
8
>
,
I
<
1
>
,
I
<
8
>
,
I
<
8
>>
,
std
::
tuple
<
ck
::
half_t
,
float
,
ck
::
half_t
,
I
<
3
>
,
I
<
2
>
,
I
<
256
>
,
I
<
1
>
,
I
<
256
>
,
I
<
1
>
,
I
<
8
>
,
I
<
1
>
,
I
<
8
>
,
I
<
8
>>
std
::
tuple
<
ck
::
half_t
,
float
,
ck
::
half_t
,
I
<
3
>
,
I
<
2
>
,
I
<
256
>
,
I
<
1
>
,
I
<
256
>
,
I
<
1
>
,
I
<
8
>
,
I
<
1
>
,
I
<
8
>
,
I
<
8
>>
,
std
::
tuple
<
ck
::
half_t
,
float
,
ck
::
half_t
,
I
<
3
>
,
I
<
2
>
,
I
<
256
>
,
I
<
1
>
,
I
<
256
>
,
I
<
1
>
,
I
<
16
>
,
I
<
1
>
,
I
<
8
>
,
I
<
8
>>
,
std
::
tuple
<
ck
::
half_t
,
float
,
ck
::
half_t
,
I
<
3
>
,
I
<
2
>
,
I
<
256
>
,
I
<
1
>
,
I
<
256
>
,
I
<
1
>
,
I
<
32
>
,
I
<
1
>
,
I
<
8
>
,
I
<
8
>>
>
;
// clang-format on
TYPED_TEST_SUITE
(
TestSoftmaxFP16
,
KernelTypes
);
...
...
test/softmax/test_softmax_fp32.cpp
View file @
2b27d5fc
...
...
@@ -15,14 +15,19 @@ class TestSoftmaxFP32 : public ck::TestSoftmax<Tuple>
// clang-format off
using
KernelTypes
=
::
testing
::
Types
<
// InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
std
::
tuple
<
float
,
float
,
ck
::
half_t
,
I
<
3
>
,
I
<
2
>
,
I
<
256
>
,
I
<
1
>
,
I
<
256
>
,
I
<
1
>
,
I
<
8
>
,
I
<
1
>
,
I
<
4
>
,
I
<
8
>>
,
// mixed precision
std
::
tuple
<
float
,
float
,
float
,
I
<
3
>
,
I
<
1
>
,
I
<
256
>
,
I
<
8
>
,
I
<
32
>
,
I
<
1
>
,
I
<
4
>
,
I
<
1
>
,
I
<
4
>
,
I
<
4
>>
,
std
::
tuple
<
float
,
float
,
float
,
I
<
3
>
,
I
<
1
>
,
I
<
256
>
,
I
<
4
>
,
I
<
64
>
,
I
<
1
>
,
I
<
4
>
,
I
<
1
>
,
I
<
4
>
,
I
<
4
>>
,
std
::
tuple
<
float
,
float
,
float
,
I
<
3
>
,
I
<
1
>
,
I
<
256
>
,
I
<
2
>
,
I
<
128
>
,
I
<
1
>
,
I
<
4
>
,
I
<
1
>
,
I
<
4
>
,
I
<
4
>>
,
std
::
tuple
<
float
,
float
,
float
,
I
<
3
>
,
I
<
1
>
,
I
<
256
>
,
I
<
1
>
,
I
<
256
>
,
I
<
1
>
,
I
<
4
>
,
I
<
1
>
,
I
<
4
>
,
I
<
4
>>
,
std
::
tuple
<
float
,
float
,
float
,
I
<
3
>
,
I
<
1
>
,
I
<
256
>
,
I
<
1
>
,
I
<
256
>
,
I
<
1
>
,
I
<
8
>
,
I
<
1
>
,
I
<
4
>
,
I
<
4
>>
,
std
::
tuple
<
float
,
float
,
float
,
I
<
3
>
,
I
<
1
>
,
I
<
256
>
,
I
<
1
>
,
I
<
256
>
,
I
<
1
>
,
I
<
16
>
,
I
<
1
>
,
I
<
4
>
,
I
<
4
>>
,
std
::
tuple
<
float
,
float
,
float
,
I
<
3
>
,
I
<
2
>
,
I
<
256
>
,
I
<
8
>
,
I
<
32
>
,
I
<
1
>
,
I
<
4
>
,
I
<
1
>
,
I
<
4
>
,
I
<
4
>>
,
std
::
tuple
<
float
,
float
,
float
,
I
<
3
>
,
I
<
2
>
,
I
<
256
>
,
I
<
4
>
,
I
<
64
>
,
I
<
1
>
,
I
<
4
>
,
I
<
1
>
,
I
<
4
>
,
I
<
4
>>
,
std
::
tuple
<
float
,
float
,
float
,
I
<
3
>
,
I
<
2
>
,
I
<
256
>
,
I
<
2
>
,
I
<
128
>
,
I
<
1
>
,
I
<
4
>
,
I
<
1
>
,
I
<
4
>
,
I
<
4
>>
,
std
::
tuple
<
float
,
float
,
float
,
I
<
3
>
,
I
<
2
>
,
I
<
256
>
,
I
<
1
>
,
I
<
256
>
,
I
<
1
>
,
I
<
4
>
,
I
<
1
>
,
I
<
4
>
,
I
<
4
>>
std
::
tuple
<
float
,
float
,
float
,
I
<
3
>
,
I
<
2
>
,
I
<
256
>
,
I
<
1
>
,
I
<
256
>
,
I
<
1
>
,
I
<
4
>
,
I
<
1
>
,
I
<
4
>
,
I
<
4
>>
,
std
::
tuple
<
float
,
float
,
float
,
I
<
3
>
,
I
<
2
>
,
I
<
256
>
,
I
<
1
>
,
I
<
256
>
,
I
<
1
>
,
I
<
8
>
,
I
<
1
>
,
I
<
4
>
,
I
<
4
>>
,
std
::
tuple
<
float
,
float
,
float
,
I
<
3
>
,
I
<
2
>
,
I
<
256
>
,
I
<
1
>
,
I
<
256
>
,
I
<
1
>
,
I
<
16
>
,
I
<
1
>
,
I
<
4
>
,
I
<
4
>>
>
;
// clang-format on
TYPED_TEST_SUITE
(
TestSoftmaxFP32
,
KernelTypes
);
...
...
test/softmax/test_softmax_int8.cpp
0 → 100644
View file @
2b27d5fc
#include "gtest/gtest.h"
#include "test_softmax_util.hpp"
template
<
ck
::
index_t
N
>
using
I
=
ck
::
Number
<
N
>
;
template
<
typename
Tuple
>
class
TestSoftmaxINT8
:
public
ck
::
TestSoftmax
<
Tuple
>
{
};
// clang-format off
using
KernelTypes
=
::
testing
::
Types
<
// InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
std
::
tuple
<
int8_t
,
float
,
int8_t
,
I
<
3
>
,
I
<
1
>
,
I
<
256
>
,
I
<
8
>
,
I
<
32
>
,
I
<
1
>
,
I
<
16
>
,
I
<
1
>
,
I
<
16
>
,
I
<
16
>>
,
std
::
tuple
<
int8_t
,
float
,
int8_t
,
I
<
3
>
,
I
<
1
>
,
I
<
256
>
,
I
<
4
>
,
I
<
64
>
,
I
<
1
>
,
I
<
16
>
,
I
<
1
>
,
I
<
16
>
,
I
<
16
>>
,
std
::
tuple
<
int8_t
,
float
,
int8_t
,
I
<
3
>
,
I
<
1
>
,
I
<
256
>
,
I
<
2
>
,
I
<
128
>
,
I
<
1
>
,
I
<
16
>
,
I
<
1
>
,
I
<
16
>
,
I
<
16
>>
,
std
::
tuple
<
int8_t
,
float
,
int8_t
,
I
<
3
>
,
I
<
1
>
,
I
<
256
>
,
I
<
1
>
,
I
<
256
>
,
I
<
1
>
,
I
<
16
>
,
I
<
1
>
,
I
<
16
>
,
I
<
16
>>
,
std
::
tuple
<
int8_t
,
float
,
int8_t
,
I
<
3
>
,
I
<
1
>
,
I
<
256
>
,
I
<
1
>
,
I
<
256
>
,
I
<
1
>
,
I
<
32
>
,
I
<
1
>
,
I
<
16
>
,
I
<
16
>>
,
std
::
tuple
<
int8_t
,
float
,
int8_t
,
I
<
3
>
,
I
<
1
>
,
I
<
256
>
,
I
<
1
>
,
I
<
256
>
,
I
<
1
>
,
I
<
64
>
,
I
<
1
>
,
I
<
16
>
,
I
<
16
>>
,
std
::
tuple
<
int8_t
,
float
,
int8_t
,
I
<
3
>
,
I
<
2
>
,
I
<
256
>
,
I
<
8
>
,
I
<
32
>
,
I
<
1
>
,
I
<
16
>
,
I
<
1
>
,
I
<
16
>
,
I
<
16
>>
,
std
::
tuple
<
int8_t
,
float
,
int8_t
,
I
<
3
>
,
I
<
2
>
,
I
<
256
>
,
I
<
4
>
,
I
<
64
>
,
I
<
1
>
,
I
<
16
>
,
I
<
1
>
,
I
<
16
>
,
I
<
16
>>
,
std
::
tuple
<
int8_t
,
float
,
int8_t
,
I
<
3
>
,
I
<
2
>
,
I
<
256
>
,
I
<
2
>
,
I
<
128
>
,
I
<
1
>
,
I
<
16
>
,
I
<
1
>
,
I
<
16
>
,
I
<
16
>>
,
std
::
tuple
<
int8_t
,
float
,
int8_t
,
I
<
3
>
,
I
<
2
>
,
I
<
256
>
,
I
<
1
>
,
I
<
256
>
,
I
<
1
>
,
I
<
16
>
,
I
<
1
>
,
I
<
16
>
,
I
<
16
>>
,
std
::
tuple
<
int8_t
,
float
,
int8_t
,
I
<
3
>
,
I
<
2
>
,
I
<
256
>
,
I
<
1
>
,
I
<
256
>
,
I
<
1
>
,
I
<
32
>
,
I
<
1
>
,
I
<
16
>
,
I
<
16
>>
,
std
::
tuple
<
int8_t
,
float
,
int8_t
,
I
<
3
>
,
I
<
2
>
,
I
<
256
>
,
I
<
1
>
,
I
<
256
>
,
I
<
1
>
,
I
<
64
>
,
I
<
1
>
,
I
<
16
>
,
I
<
16
>>
>
;
// clang-format on
TYPED_TEST_SUITE
(
TestSoftmaxINT8
,
KernelTypes
);
TYPED_TEST
(
TestSoftmaxINT8
,
Test_INT8
)
{
this
->
Run
();
}
test/softmax/test_softmax_util.hpp
View file @
2b27d5fc
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include <iostream>
#include <gtest/gtest.h>
...
...
@@ -16,6 +18,18 @@
namespace
ck
{
template
<
typename
Range
>
std
::
string
serialize_range
(
const
Range
&
range
)
{
std
::
stringstream
ss
;
for
(
auto
&
r
:
range
)
{
ss
<<
r
<<
", "
;
}
std
::
string
str
=
ss
.
str
();
return
std
::
string
(
str
.
begin
(),
str
.
end
()
-
2
);
}
template
<
typename
Tuple
>
class
TestSoftmax
:
public
::
testing
::
Test
{
...
...
@@ -80,23 +94,43 @@ class TestSoftmax : public ::testing::Test
auto
argument_ptr
=
device_instance
.
MakeArgumentPointer
(
i_in_lengths
,
i_in_strides
,
reduce_dims
,
alpha
,
beta
,
&
alpha
,
&
beta
,
in_dev
.
GetDeviceBuffer
(),
out_dev
.
GetDeviceBuffer
());
if
(
!
device_instance
.
IsSupportedArgument
(
argument_ptr
.
get
()))
{
FAIL
()
<<
"Unsupported argument"
;
// std::cout << "Skipped due to unsupported argument: "
// << "input lengths = [" << serialize_range(in_length) << "], "
// << "scaler = [" << alpha << ", " << beta << "]." << std::endl;
return
;
}
auto
invoker_ptr
=
device_instance
.
MakeInvokerPointer
();
invoker_ptr
->
Run
(
argument_ptr
.
get
());
ref_instance_invoker_
.
Run
({
in
,
out_ref
,
alpha
,
beta
,
Rank
,
reduce_dims
});
ref_instance_invoker_
.
Run
({
in
,
out_ref
,
alpha
,
beta
,
reduce_dims
});
out_dev
.
FromDevice
(
out
.
mData
.
data
());
EXPECT_TRUE
(
ck
::
utils
::
check_err
(
out
.
mData
,
out_ref
.
mData
));
bool
pass
;
if
(
std
::
is_same
<
InDataType
,
int8_t
>::
value
)
{
EXPECT_TRUE
(
pass
=
ck
::
utils
::
check_err
(
out
.
mData
,
out_ref
.
mData
,
"Error: Incorrect results!"
,
0
,
1
));
}
else
{
EXPECT_TRUE
(
pass
=
ck
::
utils
::
check_err
(
out
.
mData
,
out_ref
.
mData
));
}
if
(
!
pass
)
{
FAIL
()
<<
"Failure in input lengths = ["
<<
serialize_range
(
in_length
)
<<
"], "
<<
"scaler = ["
<<
alpha
<<
", "
<<
beta
<<
"]."
;
}
}
void
Run
()
...
...
@@ -105,13 +139,14 @@ class TestSoftmax : public ::testing::Test
{
for
(
auto
scale
:
this
->
scales_
)
{
this
->
RunSingle
(
in_length
,
s
td
::
get
<
0
>
(
scale
),
std
::
get
<
1
>
(
scale
)
);
this
->
RunSingle
(
in_length
,
s
cale
[
0
],
scale
[
1
]
);
}
}
}
std
::
vector
<
std
::
vector
<
index_t
>>
in_lengths_
=
{{
1
,
8
,
128
},
{
2
,
128
,
1024
},
{
3
,
9
,
1032
}};
std
::
vector
<
std
::
tuple
<
AccDataType
,
AccDataType
>>
scales_
=
{{
1
,
0
},
{
2
,
2
},
{
0
,
1
}};
std
::
vector
<
std
::
vector
<
index_t
>>
in_lengths_
=
{
{
1
,
8
,
128
},
{
2
,
128
,
1024
},
{
3
,
9
,
1032
},
{
4
,
4
,
2048
},
{
8
,
1
,
8192
}};
std
::
vector
<
std
::
vector
<
AccDataType
>>
scales_
=
{{
1
,
0
},
{
1
,
1
},
{
0
,
1
},
{
2
,
2
}};
typename
ReferenceInstance
::
Invoker
ref_instance_invoker_
;
};
...
...
Prev
1
…
3
4
5
6
7
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment