Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
b7a6f810
Commit
b7a6f810
authored
Mar 10, 2022
by
Chao Liu
Browse files
Merge remote-tracking branch 'origin/develop' into fix_threadwise_copy_error_in_reduction
parents
b29dfd70
827301d9
Changes
70
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
318 additions
and
302 deletions
+318
-302
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.cpp
...reduce_instance_multiblock_partial_reduce_f64_f64_f64.cpp
+28
-28
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.cpp
.../reduce/device_reduce_instance_threadwise_f16_f16_f16.cpp
+19
-19
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.cpp
.../reduce/device_reduce_instance_threadwise_f16_f32_f16.cpp
+9
-9
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.cpp
.../reduce/device_reduce_instance_threadwise_f32_f32_f32.cpp
+27
-27
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.cpp
.../reduce/device_reduce_instance_threadwise_f32_f64_f32.cpp
+9
-9
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.cpp
.../reduce/device_reduce_instance_threadwise_f64_f64_f64.cpp
+27
-27
profiler/include/profile_reduce_impl.hpp
profiler/include/profile_reduce_impl.hpp
+83
-83
profiler/src/profile_reduce.cpp
profiler/src/profile_reduce.cpp
+13
-13
script/profile_reduce_no_index.sh
script/profile_reduce_no_index.sh
+53
-45
script/profile_reduce_with_index.sh
script/profile_reduce_with_index.sh
+50
-42
No files found.
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.cpp
View file @
b7a6f810
...
@@ -6,37 +6,37 @@ namespace device {
...
@@ -6,37 +6,37 @@ namespace device {
namespace
device_reduce_instance
{
namespace
device_reduce_instance
{
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDim
s
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank |
Num
ReduceDim
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
4
,
0
,
1
,
2
);
// for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
4
,
3
);
// for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
4
,
0
,
1
,
2
);
// for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
4
,
3
);
// for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
4
,
0
,
1
,
2
);
// for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
4
,
3
);
// for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
4
,
0
,
1
,
2
);
// for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
4
,
3
);
// for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
4
,
0
,
1
,
2
);
// for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
4
,
3
);
// for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
4
,
0
,
1
,
2
);
// for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
4
,
3
);
// for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
4
,
0
,
1
,
2
);
// for NORM2
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
4
,
3
);
// for NORM2
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
2
,
1
);
// Will be moved to use MultiBlockAtomicAdd
// Will be moved to use MultiBlockAtomicAdd
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
4
,
0
,
1
,
2
);
// for ADD
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
4
,
3
);
// for ADD
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
4
,
0
,
1
,
2
);
// for AVG
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
4
,
3
);
// for AVG
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
2
,
1
);
// clang-format on
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device_reduce_instance
...
...
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.cpp
View file @
b7a6f810
...
@@ -6,25 +6,25 @@ namespace device {
...
@@ -6,25 +6,25 @@ namespace device {
namespace
device_reduce_instance
{
namespace
device_reduce_instance
{
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDim
s
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank |
Num
ReduceDim
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
0
,
4
,
0
,
1
,
2
);
// for MIN
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
0
,
4
,
3
);
// for MIN
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
0
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
0
,
4
,
1
);
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
0
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
0
,
2
,
1
);
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
0
,
4
,
0
,
1
,
2
);
// for MAX
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
0
,
4
,
3
);
// for MAX
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
0
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
0
,
4
,
1
);
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
0
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
0
,
2
,
1
);
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
0
,
4
,
0
,
1
,
2
);
// for AMAX
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
0
,
4
,
3
);
// for AMAX
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
0
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
0
,
4
,
1
);
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
0
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
0
,
2
,
1
);
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
1
,
4
,
0
,
1
,
2
);
// for MIN
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
1
,
4
,
3
);
// for MIN
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
1
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
1
,
4
,
1
);
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
1
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
1
,
2
,
1
);
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
1
,
4
,
0
,
1
,
2
);
// for MAX
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
1
,
4
,
3
);
// for MAX
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
1
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
1
,
4
,
1
);
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
1
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
1
,
2
,
1
);
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
1
,
4
,
0
,
1
,
2
);
// for AMAX
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
1
,
4
,
3
);
// for AMAX
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
1
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
1
,
4
,
1
);
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
1
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
1
,
2
,
1
);
// clang-format on
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device_reduce_instance
...
...
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.cpp
View file @
b7a6f810
...
@@ -6,16 +6,16 @@ namespace device {
...
@@ -6,16 +6,16 @@ namespace device {
namespace
device_reduce_instance
{
namespace
device_reduce_instance
{
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDim
s
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank |
Num
ReduceDim
ADD_THREADWISE_INST_BY_ID
(
half_t
,
float
,
half_t
,
0
,
0
,
0
,
4
,
0
,
1
,
2
);
// for ADD
ADD_THREADWISE_INST_BY_ID
(
half_t
,
float
,
half_t
,
0
,
0
,
0
,
4
,
3
);
// for ADD
ADD_THREADWISE_INST_BY_ID
(
half_t
,
float
,
half_t
,
0
,
0
,
0
,
4
,
0
);
ADD_THREADWISE_INST_BY_ID
(
half_t
,
float
,
half_t
,
0
,
0
,
0
,
4
,
1
);
ADD_THREADWISE_INST_BY_ID
(
half_t
,
float
,
half_t
,
0
,
0
,
0
,
2
,
1
);
ADD_THREADWISE_INST_BY_ID
(
half_t
,
float
,
half_t
,
0
,
0
,
0
,
2
,
1
);
ADD_THREADWISE_INST_BY_ID
(
half_t
,
float
,
half_t
,
5
,
0
,
0
,
4
,
0
,
1
,
2
);
// for AVG
ADD_THREADWISE_INST_BY_ID
(
half_t
,
float
,
half_t
,
5
,
0
,
0
,
4
,
3
);
// for AVG
ADD_THREADWISE_INST_BY_ID
(
half_t
,
float
,
half_t
,
5
,
0
,
0
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
half_t
,
float
,
half_t
,
5
,
0
,
0
,
4
,
1
);
ADD_THREADWISE_INST_BY_ID
(
half_t
,
float
,
half_t
,
5
,
0
,
0
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
half_t
,
float
,
half_t
,
5
,
0
,
0
,
2
,
1
);
ADD_THREADWISE_INST_BY_ID
(
half_t
,
float
,
half_t
,
7
,
0
,
0
,
4
,
0
,
1
,
2
);
// for NORM2
ADD_THREADWISE_INST_BY_ID
(
half_t
,
float
,
half_t
,
7
,
0
,
0
,
4
,
3
);
// for NORM2
ADD_THREADWISE_INST_BY_ID
(
half_t
,
float
,
half_t
,
7
,
0
,
0
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
half_t
,
float
,
half_t
,
7
,
0
,
0
,
4
,
1
);
ADD_THREADWISE_INST_BY_ID
(
half_t
,
float
,
half_t
,
7
,
0
,
0
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
half_t
,
float
,
half_t
,
7
,
0
,
0
,
2
,
1
);
// clang-format on
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device_reduce_instance
...
...
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.cpp
View file @
b7a6f810
...
@@ -6,34 +6,34 @@ namespace device {
...
@@ -6,34 +6,34 @@ namespace device {
namespace
device_reduce_instance
{
namespace
device_reduce_instance
{
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDim
s
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank |
Num
ReduceDim
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
0
,
0
,
0
,
4
,
0
,
1
,
2
);
// for ADD
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
0
,
0
,
0
,
4
,
3
);
// for ADD
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
0
,
0
,
0
,
4
,
0
);
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
0
,
0
,
0
,
4
,
1
);
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
0
,
0
,
0
,
2
,
1
);
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
0
,
0
,
0
,
2
,
1
);
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
5
,
0
,
0
,
4
,
0
,
1
,
2
);
// for AVG
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
5
,
0
,
0
,
4
,
3
);
// for AVG
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
5
,
0
,
0
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
5
,
0
,
0
,
4
,
1
);
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
5
,
0
,
0
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
5
,
0
,
0
,
2
,
1
);
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
7
,
0
,
0
,
4
,
0
,
1
,
2
);
// for NORM2
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
7
,
0
,
0
,
4
,
3
);
// for NORM2
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
7
,
0
,
0
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
7
,
0
,
0
,
4
,
1
);
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
7
,
0
,
0
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
7
,
0
,
0
,
2
,
1
);
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
2
,
0
,
0
,
4
,
0
,
1
,
2
);
// for MIN
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
2
,
0
,
0
,
4
,
3
);
// for MIN
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
2
,
0
,
0
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
2
,
0
,
0
,
4
,
1
);
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
2
,
0
,
0
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
2
,
0
,
0
,
2
,
1
);
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
3
,
0
,
0
,
4
,
0
,
1
,
2
);
// for MAX
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
3
,
0
,
0
,
4
,
3
);
// for MAX
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
3
,
0
,
0
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
3
,
0
,
0
,
4
,
1
);
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
3
,
0
,
0
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
3
,
0
,
0
,
2
,
1
);
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
4
,
0
,
0
,
4
,
0
,
1
,
2
);
// for AMAX
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
4
,
0
,
0
,
4
,
3
);
// for AMAX
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
4
,
0
,
0
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
4
,
0
,
0
,
4
,
1
);
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
4
,
0
,
0
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
4
,
0
,
0
,
2
,
1
);
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
2
,
0
,
1
,
4
,
0
,
1
,
2
);
// for MIN
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
2
,
0
,
1
,
4
,
3
);
// for MIN
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
2
,
0
,
1
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
2
,
0
,
1
,
4
,
1
);
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
2
,
0
,
1
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
2
,
0
,
1
,
2
,
1
);
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
3
,
0
,
1
,
4
,
0
,
1
,
2
);
// for MAX
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
3
,
0
,
1
,
4
,
3
);
// for MAX
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
3
,
0
,
1
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
3
,
0
,
1
,
4
,
1
);
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
3
,
0
,
1
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
3
,
0
,
1
,
2
,
1
);
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
4
,
0
,
1
,
4
,
0
,
1
,
2
);
// for AMAX
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
4
,
0
,
1
,
4
,
3
);
// for AMAX
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
4
,
0
,
1
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
4
,
0
,
1
,
4
,
1
);
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
4
,
0
,
1
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
4
,
0
,
1
,
2
,
1
);
// clang-format on
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device_reduce_instance
...
...
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.cpp
View file @
b7a6f810
...
@@ -6,16 +6,16 @@ namespace device {
...
@@ -6,16 +6,16 @@ namespace device {
namespace
device_reduce_instance
{
namespace
device_reduce_instance
{
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDim
s
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank |
Num
ReduceDim
ADD_THREADWISE_INST_BY_ID
(
float
,
double
,
float
,
0
,
0
,
0
,
4
,
0
,
1
,
2
);
// for ADD
ADD_THREADWISE_INST_BY_ID
(
float
,
double
,
float
,
0
,
0
,
0
,
4
,
3
);
// for ADD
ADD_THREADWISE_INST_BY_ID
(
float
,
double
,
float
,
0
,
0
,
0
,
4
,
0
);
ADD_THREADWISE_INST_BY_ID
(
float
,
double
,
float
,
0
,
0
,
0
,
4
,
1
);
ADD_THREADWISE_INST_BY_ID
(
float
,
double
,
float
,
0
,
0
,
0
,
2
,
1
);
ADD_THREADWISE_INST_BY_ID
(
float
,
double
,
float
,
0
,
0
,
0
,
2
,
1
);
ADD_THREADWISE_INST_BY_ID
(
float
,
double
,
float
,
5
,
0
,
0
,
4
,
0
,
1
,
2
);
// for AVG
ADD_THREADWISE_INST_BY_ID
(
float
,
double
,
float
,
5
,
0
,
0
,
4
,
3
);
// for AVG
ADD_THREADWISE_INST_BY_ID
(
float
,
double
,
float
,
5
,
0
,
0
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
float
,
double
,
float
,
5
,
0
,
0
,
4
,
1
);
ADD_THREADWISE_INST_BY_ID
(
float
,
double
,
float
,
5
,
0
,
0
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
float
,
double
,
float
,
5
,
0
,
0
,
2
,
1
);
ADD_THREADWISE_INST_BY_ID
(
float
,
double
,
float
,
7
,
0
,
0
,
4
,
0
,
1
,
2
);
// for NORM2
ADD_THREADWISE_INST_BY_ID
(
float
,
double
,
float
,
7
,
0
,
0
,
4
,
3
);
// for NORM2
ADD_THREADWISE_INST_BY_ID
(
float
,
double
,
float
,
7
,
0
,
0
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
float
,
double
,
float
,
7
,
0
,
0
,
4
,
1
);
ADD_THREADWISE_INST_BY_ID
(
float
,
double
,
float
,
7
,
0
,
0
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
float
,
double
,
float
,
7
,
0
,
0
,
2
,
1
);
// clang-format on
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device_reduce_instance
...
...
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.cpp
View file @
b7a6f810
...
@@ -6,34 +6,34 @@ namespace device {
...
@@ -6,34 +6,34 @@ namespace device {
namespace
device_reduce_instance
{
namespace
device_reduce_instance
{
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDim
s
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank |
Num
ReduceDim
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
4
,
0
,
1
,
2
);
// for ADD
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
4
,
3
);
// for ADD
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
4
,
0
);
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
4
,
1
);
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
2
,
1
);
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
2
,
1
);
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
4
,
0
,
1
,
2
);
// for AVG
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
4
,
3
);
// for AVG
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
4
,
1
);
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
2
,
1
);
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
4
,
0
,
1
,
2
);
// for NORM2
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
4
,
3
);
// for NORM2
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
4
,
1
);
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
2
,
1
);
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
4
,
0
,
1
,
2
);
// for MIN
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
4
,
3
);
// for MIN
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
4
,
1
);
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
2
,
1
);
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
4
,
0
,
1
,
2
);
// for MAX
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
4
,
3
);
// for MAX
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
4
,
1
);
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
2
,
1
);
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
4
,
0
,
1
,
2
);
// for AMAX
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
4
,
3
);
// for AMAX
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
4
,
1
);
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
2
,
1
);
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
4
,
0
,
1
,
2
);
// for MIN
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
4
,
3
);
// for MIN
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
4
,
1
);
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
2
,
1
);
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
4
,
0
,
1
,
2
);
// for MAX
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
4
,
3
);
// for MAX
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
4
,
1
);
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
2
,
1
);
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
4
,
0
,
1
,
2
);
// for AMAX
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
4
,
3
);
// for AMAX
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
4
,
1
);
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
2
,
1
);
// clang-format on
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device_reduce_instance
...
...
profiler/include/profile_reduce_impl.hpp
View file @
b7a6f810
...
@@ -9,54 +9,52 @@ namespace tensor_operation {
...
@@ -9,54 +9,52 @@ namespace tensor_operation {
namespace
device
{
namespace
device
{
namespace
device_reduce_instance
{
namespace
device_reduce_instance
{
template
<
int
Rank
,
typename
ReduceDim
s
,
int
ReduceOpId
,
int
NanOpt
,
int
IndicesOpt
>
template
<
int
Rank
,
int
Num
ReduceDim
,
int
ReduceOpId
,
int
NanOpt
,
int
IndicesOpt
>
struct
ReduceDescription
struct
ReduceDescription
{
{
static
constexpr
int
Rank_
=
Rank
;
static
constexpr
int
Rank_
=
Rank
;
static
constexpr
int
ReduceOpId_
=
ReduceOpId
;
static
constexpr
int
NumReduceDim_
=
NumReduceDim
;
static
constexpr
int
NanOpt_
=
NanOpt
;
static
constexpr
int
ReduceOpId_
=
ReduceOpId
;
static
constexpr
int
IndicesOpt_
=
IndicesOpt
;
static
constexpr
int
NanOpt_
=
NanOpt
;
static
constexpr
int
IndicesOpt_
=
IndicesOpt
;
using
ReduceDims_
=
ReduceDims
;
};
};
using
reduce_description_instances
=
using
reduce_description_instances
=
std
::
tuple
<
ReduceDescription
<
4
,
3
,
0
,
0
,
0
>
,
// for ADD
std
::
tuple
<
ReduceDescription
<
4
,
Sequence
<
0
,
1
,
2
>
,
0
,
0
,
0
>
,
// for ADD
ReduceDescription
<
4
,
1
,
0
,
0
,
0
>
,
ReduceDescription
<
4
,
Sequence
<
0
>
,
0
,
0
,
0
>
,
ReduceDescription
<
2
,
1
,
0
,
0
,
0
>
,
ReduceDescription
<
2
,
Sequence
<
1
>
,
0
,
0
,
0
>
,
ReduceDescription
<
4
,
3
,
5
,
0
,
0
>
,
// for AVG
ReduceDescription
<
4
,
Sequence
<
0
,
1
,
2
>
,
5
,
0
,
0
>
,
// for AVG
ReduceDescription
<
4
,
1
,
5
,
0
,
0
>
,
ReduceDescription
<
4
,
Sequence
<
0
>
,
5
,
0
,
0
>
,
ReduceDescription
<
2
,
1
,
5
,
0
,
0
>
,
ReduceDescription
<
2
,
Sequence
<
1
>
,
5
,
0
,
0
>
,
ReduceDescription
<
4
,
3
,
7
,
0
,
0
>
,
// for NORM2
ReduceDescription
<
4
,
Sequence
<
0
,
1
,
2
>
,
7
,
0
,
0
>
,
// for NORM2
ReduceDescription
<
4
,
1
,
7
,
0
,
0
>
,
ReduceDescription
<
4
,
Sequence
<
0
>
,
7
,
0
,
0
>
,
ReduceDescription
<
2
,
1
,
7
,
0
,
0
>
,
ReduceDescription
<
2
,
Sequence
<
1
>
,
7
,
0
,
0
>
,
ReduceDescription
<
4
,
3
,
2
,
0
,
0
>
,
// for MIN
ReduceDescription
<
4
,
Sequence
<
0
,
1
,
2
>
,
2
,
0
,
0
>
,
// for MIN
ReduceDescription
<
4
,
1
,
2
,
0
,
0
>
,
ReduceDescription
<
4
,
Sequence
<
0
>
,
2
,
0
,
0
>
,
ReduceDescription
<
2
,
1
,
2
,
0
,
0
>
,
ReduceDescription
<
2
,
Sequence
<
1
>
,
2
,
0
,
0
>
,
ReduceDescription
<
4
,
3
,
3
,
0
,
0
>
,
// for MAX
ReduceDescription
<
4
,
Sequence
<
0
,
1
,
2
>
,
3
,
0
,
0
>
,
// for MAX
ReduceDescription
<
4
,
1
,
3
,
0
,
0
>
,
ReduceDescription
<
4
,
Sequence
<
0
>
,
3
,
0
,
0
>
,
ReduceDescription
<
2
,
1
,
3
,
0
,
0
>
,
ReduceDescription
<
2
,
Sequence
<
1
>
,
3
,
0
,
0
>
,
ReduceDescription
<
4
,
3
,
4
,
0
,
0
>
,
// for AMAX
ReduceDescription
<
4
,
Sequence
<
0
,
1
,
2
>
,
4
,
0
,
0
>
,
// for AMAX
ReduceDescription
<
4
,
1
,
4
,
0
,
0
>
,
ReduceDescription
<
4
,
Sequence
<
0
>
,
4
,
0
,
0
>
,
ReduceDescription
<
2
,
1
,
4
,
0
,
0
>
,
ReduceDescription
<
2
,
Sequence
<
1
>
,
4
,
0
,
0
>
,
ReduceDescription
<
4
,
3
,
2
,
0
,
1
>
,
// for MIN
ReduceDescription
<
4
,
Sequence
<
0
,
1
,
2
>
,
2
,
0
,
1
>
,
// for MIN
ReduceDescription
<
4
,
1
,
2
,
0
,
1
>
,
ReduceDescription
<
4
,
Sequence
<
0
>
,
2
,
0
,
1
>
,
ReduceDescription
<
2
,
1
,
2
,
0
,
1
>
,
ReduceDescription
<
2
,
Sequence
<
1
>
,
2
,
0
,
1
>
,
ReduceDescription
<
4
,
3
,
3
,
0
,
1
>
,
// for MAX
ReduceDescription
<
4
,
Sequence
<
0
,
1
,
2
>
,
3
,
0
,
1
>
,
// for MAX
ReduceDescription
<
4
,
1
,
3
,
0
,
1
>
,
ReduceDescription
<
4
,
Sequence
<
0
>
,
3
,
0
,
1
>
,
ReduceDescription
<
2
,
1
,
3
,
0
,
1
>
,
ReduceDescription
<
2
,
Sequence
<
1
>
,
3
,
0
,
1
>
,
ReduceDescription
<
4
,
3
,
4
,
0
,
1
>
,
// for AMAX
ReduceDescription
<
4
,
Sequence
<
0
,
1
,
2
>
,
4
,
0
,
1
>
,
// for AMAX
ReduceDescription
<
4
,
1
,
4
,
0
,
1
>
,
ReduceDescription
<
4
,
Sequence
<
0
>
,
4
,
0
,
1
>
,
ReduceDescription
<
2
,
1
,
4
,
0
,
1
>>
;
ReduceDescription
<
2
,
Sequence
<
1
>
,
4
,
0
,
1
>>
;
template
<
typename
DescriptionType
>
template
<
typename
DescriptionType
>
bool
description_match
(
const
DescriptionType
&
description
,
bool
description_match
(
const
DescriptionType
&
description
,
int
Rank
,
int
Rank
,
const
std
::
vector
<
int
>&
R
educeDims
,
const
std
::
vector
<
int
>&
r
educeDims
,
ReduceTensorOp_t
ReduceOpId
,
ReduceTensorOp_t
ReduceOpId
,
NanPropagation_t
NanOpt
,
NanPropagation_t
NanOpt
,
ReduceTensorIndices_t
IndicesOpt
)
ReduceTensorIndices_t
IndicesOpt
)
...
@@ -66,16 +64,11 @@ bool description_match(const DescriptionType& description,
...
@@ -66,16 +64,11 @@ bool description_match(const DescriptionType& description,
description
.
IndicesOpt_
!=
static_cast
<
int
>
(
IndicesOpt
))
description
.
IndicesOpt_
!=
static_cast
<
int
>
(
IndicesOpt
))
return
(
false
);
return
(
false
);
if
(
DescriptionType
::
ReduceDim
s_
::
Size
()
!=
R
educeDims
.
size
())
if
(
DescriptionType
::
Num
ReduceDim
_
!=
r
educeDims
.
size
())
return
(
false
);
return
(
false
);
bool
result
=
true
;
bool
result
=
true
;
static_for
<
0
,
DescriptionType
::
ReduceDims_
::
Size
(),
1
>
{}([
&
](
auto
i
)
{
if
(
DescriptionType
::
ReduceDims_
::
At
(
i
)
!=
ReduceDims
[
i
])
result
=
false
;
});
return
(
result
);
return
(
result
);
};
};
...
@@ -87,33 +80,29 @@ bool description_match(const DescriptionType& description,
...
@@ -87,33 +80,29 @@ bool description_match(const DescriptionType& description,
namespace
ck
{
namespace
ck
{
namespace
profiler
{
namespace
profiler
{
template
<
int
Rank
,
typename
ReduceDims
>
template
<
index_t
Rank
,
index_t
NumReduceDim
>
static
std
::
vector
<
int
>
get_reduce_dims
()
static
inline
std
::
vector
<
int
>
get_invariant_dims
(
const
std
::
vector
<
int
>&
reduceDims
)
{
std
::
vector
<
int
>
resDims
;
static_for
<
0
,
ReduceDims
::
Size
(),
1
>
{}([
&
](
auto
i
)
{
resDims
.
push_back
(
ReduceDims
::
At
(
i
));
});
return
(
resDims
);
};
template
<
int
Rank
,
typename
ReduceDims
>
static
std
::
vector
<
int
>
get_invariant_dims
()
{
{
std
::
vector
<
int
>
resDims
;
assert
(
NumReduceDim
==
reduceDims
.
size
());
unsigned
int
incFlag
=
0
;
static_for
<
0
,
ReduceDims
::
Size
(),
1
>
{}(
int
reduceFlag
=
0
;
[
&
](
auto
i
)
{
incFlag
=
incFlag
|
(
0x1
<<
ReduceDims
::
At
(
i
));
});
for
(
int
dim
=
0
;
dim
<
Rank
;
dim
++
)
// flag the bits for the reduceDims
for
(
int
i
=
0
;
i
<
NumReduceDim
;
i
++
)
{
{
if
(
incFlag
&
(
0x1
<<
dim
))
reduceFlag
|=
1
<<
reduceDims
[
i
];
continue
;
resDims
.
push_back
(
dim
);
};
};
return
(
resDims
);
std
::
vector
<
int
>
invariantDims
;
// collect invariant dimensions
for
(
int
i
=
0
;
i
<
Rank
;
i
++
)
if
((
reduceFlag
&
(
1
<<
i
))
==
0
)
{
invariantDims
.
push_back
(
i
);
};
return
invariantDims
;
};
};
template
<
typename
T
>
template
<
typename
T
>
...
@@ -149,7 +138,7 @@ template <typename InDataType,
...
@@ -149,7 +138,7 @@ template <typename InDataType,
typename
AccDataType
,
typename
AccDataType
,
typename
OutDataType
,
typename
OutDataType
,
int
Rank
,
int
Rank
,
typename
ReduceDim
s_
,
int
Num
ReduceDim
,
ReduceTensorOp_t
ReduceOpId
,
ReduceTensorOp_t
ReduceOpId
,
NanPropagation_t
NanOpt
,
NanPropagation_t
NanOpt
,
ReduceTensorIndices_t
IndicesOpt
>
ReduceTensorIndices_t
IndicesOpt
>
...
@@ -159,6 +148,7 @@ void profile_reduce_impl_impl(bool do_verification,
...
@@ -159,6 +148,7 @@ void profile_reduce_impl_impl(bool do_verification,
bool
do_dumpout
,
bool
do_dumpout
,
int
nrepeat
,
int
nrepeat
,
const
std
::
vector
<
size_t
>&
inLengths
,
const
std
::
vector
<
size_t
>&
inLengths
,
const
std
::
vector
<
int
>&
reduceDims
,
float
alpha
,
float
alpha
,
float
beta
)
float
beta
)
{
{
...
@@ -203,15 +193,14 @@ void profile_reduce_impl_impl(bool do_verification,
...
@@ -203,15 +193,14 @@ void profile_reduce_impl_impl(bool do_verification,
{
{
Tensor
<
InDataType
>
in
(
inLengths
);
Tensor
<
InDataType
>
in
(
inLengths
);
const
std
::
vector
<
int
>
OuterDims
=
get_invariant_dims
<
Rank
,
ReduceDims_
>
();
const
std
::
vector
<
int
>
ReduceDims
=
get_reduce_dims
<
Rank
,
ReduceDims_
>
();
std
::
vector
<
size_t
>
outLengths
;
std
::
vector
<
size_t
>
outLengths
;
if
(
OuterDims
.
empty
())
const
auto
invariantDims
=
get_invariant_dims
<
Rank
,
NumReduceDim
>
(
reduceDims
);
if
(
reduceDims
.
size
()
==
Rank
)
outLengths
.
push_back
(
1
);
outLengths
.
push_back
(
1
);
else
else
for
(
auto
dim
:
Outer
Dims
)
for
(
auto
dim
:
invariant
Dims
)
outLengths
.
push_back
(
inLengths
[
dim
]);
outLengths
.
push_back
(
inLengths
[
dim
]);
Tensor
<
OutDataType
>
out_ref
(
outLengths
);
Tensor
<
OutDataType
>
out_ref
(
outLengths
);
...
@@ -302,7 +291,7 @@ void profile_reduce_impl_impl(bool do_verification,
...
@@ -302,7 +291,7 @@ void profile_reduce_impl_impl(bool do_verification,
AccDataType
,
AccDataType
,
OutDataType
,
OutDataType
,
Rank
,
Rank
,
ReduceDim
s_
,
Num
ReduceDim
,
ReduceOpId
,
ReduceOpId
,
NanOpt
,
NanOpt
,
IndicesOpt
>
(
reduce0_ptrs
);
IndicesOpt
>
(
reduce0_ptrs
);
...
@@ -311,7 +300,7 @@ void profile_reduce_impl_impl(bool do_verification,
...
@@ -311,7 +300,7 @@ void profile_reduce_impl_impl(bool do_verification,
AccDataType
,
AccDataType
,
OutDataType
,
OutDataType
,
Rank
,
Rank
,
ReduceDim
s_
,
Num
ReduceDim
,
ReduceOpId
,
ReduceOpId
,
NanOpt
,
NanOpt
,
IndicesOpt
>
(
reduce0_ptrs
);
IndicesOpt
>
(
reduce0_ptrs
);
...
@@ -321,7 +310,7 @@ void profile_reduce_impl_impl(bool do_verification,
...
@@ -321,7 +310,7 @@ void profile_reduce_impl_impl(bool do_verification,
AccDataType
,
AccDataType
,
OutDataType
,
OutDataType
,
Rank
,
Rank
,
ReduceDim
s_
,
Num
ReduceDim
,
ReduceOpId
,
ReduceOpId
,
NanOpt
,
NanOpt
,
IndicesOpt
>
(
reduce0_ptrs
);
IndicesOpt
>
(
reduce0_ptrs
);
...
@@ -330,7 +319,7 @@ void profile_reduce_impl_impl(bool do_verification,
...
@@ -330,7 +319,7 @@ void profile_reduce_impl_impl(bool do_verification,
AccDataType
,
AccDataType
,
OutDataType
,
OutDataType
,
Rank
,
Rank
,
ReduceDim
s_
,
Num
ReduceDim
,
ReduceOpId
,
ReduceOpId
,
NanOpt
,
NanOpt
,
IndicesOpt
>
(
reduce1_ptrs
);
IndicesOpt
>
(
reduce1_ptrs
);
...
@@ -341,7 +330,7 @@ void profile_reduce_impl_impl(bool do_verification,
...
@@ -341,7 +330,7 @@ void profile_reduce_impl_impl(bool do_verification,
AccDataType
,
AccDataType
,
OutDataType
,
OutDataType
,
Rank
,
Rank
,
ReduceDim
s_
,
Num
ReduceDim
,
ReduceOpId
,
ReduceOpId
,
NanOpt
,
NanOpt
,
IndicesOpt
>
(
reduce2_ptrs
);
IndicesOpt
>
(
reduce2_ptrs
);
...
@@ -358,7 +347,7 @@ void profile_reduce_impl_impl(bool do_verification,
...
@@ -358,7 +347,7 @@ void profile_reduce_impl_impl(bool do_verification,
using
hCompType
=
typename
type_mapping
<
AccDataType
>::
outDataType
;
using
hCompType
=
typename
type_mapping
<
AccDataType
>::
outDataType
;
ReductionHost
<
hInType
,
hCompType
,
hOutType
,
ReduceOpId
,
PropagateNan
,
NeedIndices
>
ReductionHost
<
hInType
,
hCompType
,
hOutType
,
ReduceOpId
,
PropagateNan
,
NeedIndices
>
hostReduce
(
in
.
mDesc
,
out_ref
.
mDesc
,
Outer
Dims
,
R
educeDims
);
hostReduce
(
in
.
mDesc
,
out_ref
.
mDesc
,
invariant
Dims
,
r
educeDims
);
hostReduce
.
Run
(
alpha
,
hostReduce
.
Run
(
alpha
,
reinterpret_cast
<
const
hInType
*>
(
in
.
mData
.
data
()),
reinterpret_cast
<
const
hInType
*>
(
in
.
mData
.
data
()),
...
@@ -383,6 +372,7 @@ void profile_reduce_impl_impl(bool do_verification,
...
@@ -383,6 +372,7 @@ void profile_reduce_impl_impl(bool do_verification,
i_inStrides
,
i_inStrides
,
i_outLengths
,
i_outLengths
,
i_outStrides
,
i_outStrides
,
reduceDims
,
alpha
,
alpha
,
beta
,
beta
,
in_dev
.
GetDeviceBuffer
(),
in_dev
.
GetDeviceBuffer
(),
...
@@ -464,6 +454,7 @@ void profile_reduce_impl_impl(bool do_verification,
...
@@ -464,6 +454,7 @@ void profile_reduce_impl_impl(bool do_verification,
i_inStrides
,
i_inStrides
,
i_outLengths
,
i_outLengths
,
i_outStrides
,
i_outStrides
,
reduceDims
,
alpha
,
alpha
,
beta
,
beta
,
in_dev
.
GetDeviceBuffer
(),
in_dev
.
GetDeviceBuffer
(),
...
@@ -496,6 +487,7 @@ void profile_reduce_impl_impl(bool do_verification,
...
@@ -496,6 +487,7 @@ void profile_reduce_impl_impl(bool do_verification,
inStrides2
,
inStrides2
,
i_outLengths
,
i_outLengths
,
i_outStrides
,
i_outStrides
,
reduceDims
,
alpha
,
alpha
,
beta
,
beta
,
ws_dev
.
GetDeviceBuffer
(),
ws_dev
.
GetDeviceBuffer
(),
...
@@ -584,7 +576,7 @@ void profile_reduce_impl(bool do_verification,
...
@@ -584,7 +576,7 @@ void profile_reduce_impl(bool do_verification,
bool
do_dumpout
,
bool
do_dumpout
,
int
nrepeat
,
int
nrepeat
,
const
std
::
vector
<
size_t
>&
inLengths
,
const
std
::
vector
<
size_t
>&
inLengths
,
const
std
::
vector
<
int
>&
R
educeDims
,
const
std
::
vector
<
int
>&
r
educeDims
,
ReduceTensorOp_t
ReduceOpId
,
ReduceTensorOp_t
ReduceOpId
,
NanPropagation_t
NanOpt
,
NanPropagation_t
NanOpt
,
ReduceTensorIndices_t
IndicesOpt
,
ReduceTensorIndices_t
IndicesOpt
,
...
@@ -605,18 +597,26 @@ void profile_reduce_impl(bool do_verification,
...
@@ -605,18 +597,26 @@ void profile_reduce_impl(bool do_verification,
using
descType
=
remove_cvref_t
<
decltype
(
std
::
get
<
i
>
(
tuple_object
))
>
;
using
descType
=
remove_cvref_t
<
decltype
(
std
::
get
<
i
>
(
tuple_object
))
>
;
if
(
!
description_match
(
if
(
!
description_match
(
descType
{},
inLengths
.
size
(),
R
educeDims
,
ReduceOpId
,
NanOpt
,
IndicesOpt
))
descType
{},
inLengths
.
size
(),
r
educeDims
,
ReduceOpId
,
NanOpt
,
IndicesOpt
))
return
;
return
;
profile_reduce_impl_impl
<
InDataType
,
profile_reduce_impl_impl
<
InDataType
,
AccDataType
,
AccDataType
,
OutDataType
,
OutDataType
,
descType
::
Rank_
,
descType
::
Rank_
,
typename
descType
::
ReduceDim
s
_
,
descType
::
Num
ReduceDim_
,
static_cast
<
ReduceTensorOp_t
>
(
descType
::
ReduceOpId_
),
static_cast
<
ReduceTensorOp_t
>
(
descType
::
ReduceOpId_
),
static_cast
<
NanPropagation_t
>
(
descType
::
NanOpt_
),
static_cast
<
NanPropagation_t
>
(
descType
::
NanOpt_
),
static_cast
<
ReduceTensorIndices_t
>
(
descType
::
IndicesOpt_
)
>
(
static_cast
<
ReduceTensorIndices_t
>
(
descType
::
IndicesOpt_
)
>
(
do_verification
,
init_method
,
do_log
,
do_dumpout
,
nrepeat
,
inLengths
,
alpha
,
beta
);
do_verification
,
init_method
,
do_log
,
do_dumpout
,
nrepeat
,
inLengths
,
reduceDims
,
alpha
,
beta
);
matched
=
true
;
matched
=
true
;
});
});
...
...
profiler/src/profile_reduce.cpp
View file @
b7a6f810
...
@@ -25,7 +25,7 @@ using ck::ReduceTensorIndices_t;
...
@@ -25,7 +25,7 @@ using ck::ReduceTensorIndices_t;
using
ck
::
ReduceTensorOp_t
;
using
ck
::
ReduceTensorOp_t
;
static
struct
option
long_options
[]
=
{{
"inLengths"
,
required_argument
,
nullptr
,
'D'
},
static
struct
option
long_options
[]
=
{{
"inLengths"
,
required_argument
,
nullptr
,
'D'
},
{
"
toR
educeDims"
,
required_argument
,
nullptr
,
'R'
},
{
"
r
educeDims"
,
required_argument
,
nullptr
,
'R'
},
{
"reduceOp"
,
required_argument
,
nullptr
,
'O'
},
{
"reduceOp"
,
required_argument
,
nullptr
,
'O'
},
{
"compType"
,
required_argument
,
nullptr
,
'C'
},
{
"compType"
,
required_argument
,
nullptr
,
'C'
},
{
"outType"
,
required_argument
,
nullptr
,
'W'
},
{
"outType"
,
required_argument
,
nullptr
,
'W'
},
...
@@ -93,9 +93,9 @@ typedef enum
...
@@ -93,9 +93,9 @@ typedef enum
appDouble
=
6
,
appDouble
=
6
,
}
appDataType_t
;
}
appDataType_t
;
static
void
check_reduce_dims
(
const
int
rank
,
const
std
::
vector
<
int
>&
toR
educeDims
)
static
void
check_reduce_dims
(
const
int
rank
,
const
std
::
vector
<
int
>&
r
educeDims
)
{
{
for
(
auto
dim
:
toR
educeDims
)
for
(
auto
dim
:
r
educeDims
)
{
{
if
(
dim
<
0
||
dim
>=
rank
)
if
(
dim
<
0
||
dim
>=
rank
)
throw
std
::
runtime_error
(
"Invalid dimension index specified for Reducing"
);
throw
std
::
runtime_error
(
"Invalid dimension index specified for Reducing"
);
...
@@ -103,7 +103,7 @@ static void check_reduce_dims(const int rank, const std::vector<int>& toReduceDi
...
@@ -103,7 +103,7 @@ static void check_reduce_dims(const int rank, const std::vector<int>& toReduceDi
unsigned
int
flag
=
0
;
unsigned
int
flag
=
0
;
for
(
auto
dim
:
toR
educeDims
)
for
(
auto
dim
:
r
educeDims
)
{
{
if
(
flag
&
(
0x1
<<
dim
))
if
(
flag
&
(
0x1
<<
dim
))
throw
std
::
runtime_error
(
"All toReduce dimensions should be different!"
);
throw
std
::
runtime_error
(
"All toReduce dimensions should be different!"
);
...
@@ -122,7 +122,7 @@ class AppArgs
...
@@ -122,7 +122,7 @@ class AppArgs
std
::
vector
<
size_t
>
inLengths
;
std
::
vector
<
size_t
>
inLengths
;
std
::
vector
<
size_t
>
outLengths
;
std
::
vector
<
size_t
>
outLengths
;
std
::
vector
<
int
>
toR
educeDims
;
std
::
vector
<
int
>
r
educeDims
;
std
::
vector
<
float
>
scales
;
std
::
vector
<
float
>
scales
;
...
@@ -152,7 +152,7 @@ class AppArgs
...
@@ -152,7 +152,7 @@ class AppArgs
std
::
cout
<<
"Usage of "
<<
cmd
<<
std
::
endl
;
std
::
cout
<<
"Usage of "
<<
cmd
<<
std
::
endl
;
std
::
cout
<<
"--inLengths or -D, comma separated list of input tensor dimension lengths"
std
::
cout
<<
"--inLengths or -D, comma separated list of input tensor dimension lengths"
<<
std
::
endl
;
<<
std
::
endl
;
std
::
cout
<<
"--
toR
educeDims or -R, comma separated list of to-reduce dimensions"
std
::
cout
<<
"--
r
educeDims or -R, comma separated list of to-reduce dimensions"
<<
std
::
endl
;
<<
std
::
endl
;
std
::
cout
<<
"--reduceOp or -O, enum value indicating the reduction operations"
std
::
cout
<<
"--reduceOp or -O, enum value indicating the reduction operations"
<<
std
::
endl
;
<<
std
::
endl
;
...
@@ -201,7 +201,7 @@ class AppArgs
...
@@ -201,7 +201,7 @@ class AppArgs
if
(
!
optarg
)
if
(
!
optarg
)
throw
std
::
runtime_error
(
"Invalid option format!"
);
throw
std
::
runtime_error
(
"Invalid option format!"
);
toR
educeDims
=
getTypeValuesFromString
<
int
>
(
optarg
);
r
educeDims
=
getTypeValuesFromString
<
int
>
(
optarg
);
break
;
break
;
case
'O'
:
case
'O'
:
if
(
!
optarg
)
if
(
!
optarg
)
...
@@ -321,7 +321,7 @@ int profile_reduce(int argc, char* argv[])
...
@@ -321,7 +321,7 @@ int profile_reduce(int argc, char* argv[])
int
rank
=
args
.
inLengths
.
size
();
int
rank
=
args
.
inLengths
.
size
();
check_reduce_dims
(
rank
,
args
.
toR
educeDims
);
check_reduce_dims
(
rank
,
args
.
r
educeDims
);
if
(
args
.
reduceOp
==
ReduceTensorOp_t
::
MUL
||
args
.
reduceOp
==
ReduceTensorOp_t
::
NORM1
)
if
(
args
.
reduceOp
==
ReduceTensorOp_t
::
MUL
||
args
.
reduceOp
==
ReduceTensorOp_t
::
NORM1
)
throw
std
::
runtime_error
(
"MUL and NORM1 are not supported by composable kernel!"
);
throw
std
::
runtime_error
(
"MUL and NORM1 are not supported by composable kernel!"
);
...
@@ -345,7 +345,7 @@ int profile_reduce(int argc, char* argv[])
...
@@ -345,7 +345,7 @@ int profile_reduce(int argc, char* argv[])
args
.
do_dumpout
,
args
.
do_dumpout
,
args
.
nrepeat
,
args
.
nrepeat
,
args
.
inLengths
,
args
.
inLengths
,
args
.
toR
educeDims
,
args
.
r
educeDims
,
args
.
reduceOp
,
args
.
reduceOp
,
args
.
nanOpt
,
args
.
nanOpt
,
args
.
indicesOpt
,
args
.
indicesOpt
,
...
@@ -360,7 +360,7 @@ int profile_reduce(int argc, char* argv[])
...
@@ -360,7 +360,7 @@ int profile_reduce(int argc, char* argv[])
args
.
do_dumpout
,
args
.
do_dumpout
,
args
.
nrepeat
,
args
.
nrepeat
,
args
.
inLengths
,
args
.
inLengths
,
args
.
toR
educeDims
,
args
.
r
educeDims
,
args
.
reduceOp
,
args
.
reduceOp
,
args
.
nanOpt
,
args
.
nanOpt
,
args
.
indicesOpt
,
args
.
indicesOpt
,
...
@@ -378,7 +378,7 @@ int profile_reduce(int argc, char* argv[])
...
@@ -378,7 +378,7 @@ int profile_reduce(int argc, char* argv[])
args
.
do_dumpout
,
args
.
do_dumpout
,
args
.
nrepeat
,
args
.
nrepeat
,
args
.
inLengths
,
args
.
inLengths
,
args
.
toR
educeDims
,
args
.
r
educeDims
,
args
.
reduceOp
,
args
.
reduceOp
,
args
.
nanOpt
,
args
.
nanOpt
,
args
.
indicesOpt
,
args
.
indicesOpt
,
...
@@ -395,7 +395,7 @@ int profile_reduce(int argc, char* argv[])
...
@@ -395,7 +395,7 @@ int profile_reduce(int argc, char* argv[])
args
.
do_dumpout
,
args
.
do_dumpout
,
args
.
nrepeat
,
args
.
nrepeat
,
args
.
inLengths
,
args
.
inLengths
,
args
.
toR
educeDims
,
args
.
r
educeDims
,
args
.
reduceOp
,
args
.
reduceOp
,
args
.
nanOpt
,
args
.
nanOpt
,
args
.
indicesOpt
,
args
.
indicesOpt
,
...
@@ -410,7 +410,7 @@ int profile_reduce(int argc, char* argv[])
...
@@ -410,7 +410,7 @@ int profile_reduce(int argc, char* argv[])
args
.
do_dumpout
,
args
.
do_dumpout
,
args
.
nrepeat
,
args
.
nrepeat
,
args
.
inLengths
,
args
.
inLengths
,
args
.
toR
educeDims
,
args
.
r
educeDims
,
args
.
reduceOp
,
args
.
reduceOp
,
args
.
nanOpt
,
args
.
nanOpt
,
args
.
indicesOpt
,
args
.
indicesOpt
,
...
...
script/profile_reduce_no_index.sh
View file @
b7a6f810
#!/bin/bash
#!/bin/bash
PRECISION
=
##--half
PRECISION
=
##PRECISION=--half
##PRECISION=--double
if
test
-n
$PRECISION
&&
test
"
$PRECISION
"
=
"--half"
;
then
if
test
-n
$PRECISION
&&
test
"
$PRECISION
"
=
"--half"
;
then
CTYPE
=
"-C 1"
AC
CTYPE
=
"-C 1"
else
else
CTYPE
=
""
AC
CTYPE
=
""
fi
fi
WTYPE
=
driver
=
"./bin/ckProfiler"
VERIFY
=
"-v
$1
"
INIT
=
$2
NREPEAT
=
$3
if
[
$#
-ge
1
]
;
then
NREPEAT
=
$1
else
NREPEAT
=
1
fi
Operation
=
7
#### 0 - ADD, 5 - AVG, 7 - NORM2
Operations
=
"0 5 7"
## for generic validation
## for generic validation
for
op
in
$Operation
;
do
for
op
in
$Operation
s
;
do
set
-x
set
-x
./bin/ckProfiler reduce
$PRECISION
-D
64,4,280,82
-R
0
-O
$op
$CTYPE
-v
1 1
$NREPEAT
####### datatype layout reduce dims op acctype verify init repeats
./bin/ckProfiler reduce
$PRECISION
-D
4,64,280,82
-R
0
-O
$op
$CTYPE
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
64,4,280,82
-R
0
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
280,4,64,82
-R
0
-O
$op
$CTYPE
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
64,4,280,82
-R
1
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
64,4,280,82
-R
0,1,2
-O
$op
$CTYPE
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
64,4,280,82
-R
2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
4,64,280,82
-R
0,1,2
-O
$op
$CTYPE
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
64,4,280,82
-R
3
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
64,280,82,4
-R
0,1,2
-O
$op
$CTYPE
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
64,4,280,82
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
700,8192
-R
1
-O
$op
$CTYPE
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
64,4,280,82
-R
1,2,3
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
700,1024
-R
1
-O
$op
$CTYPE
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
64,4,280,82
-R
0,2,3
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
700,4
-R
1
-O
$op
$CTYPE
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
64,4,280,82
-R
0,1,3
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$driver
reduce
$PRECISION
-D
256,22960
-R
0
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$driver
reduce
$PRECISION
-D
256,22960
-R
1
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$driver
reduce
$PRECISION
-D
4,1469440
-R
0
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$driver
reduce
$PRECISION
-D
4,1469440
-R
1
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
set
+x
set
+x
done
done
Operation
=
5
#### 0 - ADD, 5 - AVG, 7 - NORM2
Operations
=
5
## for performance evaluation (resnet50 NHWC => C)
## for performance evaluation (resnet50 NHWC => C)
for
op
in
$Operation
;
do
for
op
in
$Operation
s
;
do
set
-x
set
-x
./bin/ckProfiler reduce
$PRECISION
-D
256,14,14,1024
-R
0,1,2
-O
$op
$CTYPE
$WTYPE
-v
1 1
$NREPEAT
####### datatype layout reduce dims op acctype verify init repeats
./bin/ckProfiler reduce
$PRECISION
-D
256,28,28,128
-R
0,1,2
-O
$op
$CTYPE
$WTYPE
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
256,14,14,1024
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
256,58,58,128
-R
0,1,2
-O
$op
$CTYPE
$WTYPE
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
256,28,28,128
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
256,7,7,2048
-R
0,1,2
-O
$op
$CTYPE
$WTYPE
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
256,58,58,128
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
256,14,14,256
-R
0,1,2
-O
$op
$CTYPE
$WTYPE
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
256,7,7,2048
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
256,30,30,256
-R
0,1,2
-O
$op
$CTYPE
$WTYPE
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
256,14,14,256
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
256,56,56,256
-R
0,1,2
-O
$op
$CTYPE
$WTYPE
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
256,30,30,256
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
256,16,16,512
-R
0,1,2
-O
$op
$CTYPE
$WTYPE
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
256,56,56,256
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
256,28,28,512
-R
0,1,2
-O
$op
$CTYPE
$WTYPE
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
256,16,16,512
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
256,7,7,512
-R
0,1,2
-O
$op
$CTYPE
$WTYPE
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
256,28,28,512
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
256,56,56,64
-R
0,1,2
-O
$op
$CTYPE
$WTYPE
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
256,7,7,512
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
256,230,230,3
-R
0,1,2
-O
$op
$CTYPE
$WTYPE
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
256,56,56,64
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
128,14,14,1024
-R
0,1,2
-O
$op
$CTYPE
$WTYPE
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
256,230,230,3
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
128,28,28,128
-R
0,1,2
-O
$op
$CTYPE
$WTYPE
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
128,14,14,1024
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
128,58,58,128
-R
0,1,2
-O
$op
$CTYPE
$WTYPE
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
128,28,28,128
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
128,7,7,2048
-R
0,1,2
-O
$op
$CTYPE
$WTYPE
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
128,58,58,128
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
128,14,14,256
-R
0,1,2
-O
$op
$CTYPE
$WTYPE
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
128,7,7,2048
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
128,30,30,256
-R
0,1,2
-O
$op
$CTYPE
$WTYPE
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
128,14,14,256
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
128,56,56,256
-R
0,1,2
-O
$op
$CTYPE
$WTYPE
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
128,30,30,256
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
128,16,16,512
-R
0,1,2
-O
$op
$CTYPE
$WTYPE
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
128,56,56,256
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
128,28,28,512
-R
0,1,2
-O
$op
$CTYPE
$WTYPE
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
128,16,16,512
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
128,7,7,512
-R
0,1,2
-O
$op
$CTYPE
$WTYPE
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
128,28,28,512
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
128,56,56,64
-R
0,1,2
-O
$op
$CTYPE
$WTYPE
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
128,7,7,512
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
$driver
reduce
$PRECISION
-D
128,56,56,64
-R
0,1,2
-O
$op
$ACCTYPE
$VERIFY
$INIT
$NREPEAT
set
+x
set
+x
done
done
script/profile_reduce_with_index.sh
View file @
b7a6f810
#!/bin/bash
#!/bin/bash
PRECISION
=
##--half
PRECISION
=
##PRECISION=--half
##PRECISION=--double
if
[
$#
-ge
1
]
;
then
driver
=
"./bin/ckProfiler"
NREPEAT
=
$1
else
NREPEAT
=
1
fi
Operation
=
4
VERIFY
=
"-v
$1
"
INIT
=
$2
NREPEAT
=
$3
LENGTHS
=
64,4,280,82
#### 2 - MIN, 3 - MAX, 4 - AMAX
Operations
=
"2 4"
## for generic validation
## for generic validation
for
op
in
$Operation
;
do
for
op
in
$Operation
s
;
do
for
use_idx
in
0 1
;
do
for
use_idx
in
0 1
;
do
set
-x
set
-x
./bin/ckProfiler reduce
$PRECISION
-D
64,4,280,82
-R
0
-O
$op
$CTYPE
-v
1 1
$NREPEAT
####### datatype layout reduce dims op use index verify init repeats
./bin/ckProfiler reduce
$PRECISION
-D
4,64,280,82
-R
0
-O
$op
$CTYPE
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
64,4,280,82
-R
0
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
280,4,64,82
-R
0
-O
$op
$CTYPE
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
64,4,280,82
-R
1
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
64,4,280,82
-R
0,1,2
-O
$op
$CTYPE
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
64,4,280,82
-R
2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
4,64,280,82
-R
0,1,2
-O
$op
$CTYPE
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
64,4,280,82
-R
3
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
64,280,82,4
-R
0,1,2
-O
$op
$CTYPE
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
64,4,280,82
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
700,8192
-R
1
-O
$op
$CTYPE
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
64,4,280,82
-R
1,2,3
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
700,1024
-R
1
-O
$op
$CTYPE
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
64,4,280,82
-R
0,2,3
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
700,4
-R
1
-O
$op
$CTYPE
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
64,4,280,82
-R
0,1,3
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$driver
reduce
$PRECISION
-D
256,22960
-R
0
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$driver
reduce
$PRECISION
-D
256,22960
-R
1
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$driver
reduce
$PRECISION
-D
4,1469440
-R
0
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$driver
reduce
$PRECISION
-D
4,1469440
-R
1
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
set
+x
set
+x
done
done
done
done
Operations
=
2
## for performance evaluation (resnet50 NHWC => C)
## for performance evaluation (resnet50 NHWC => C)
for
op
in
$Operation
;
do
for
op
in
$Operation
s
;
do
for
use_idx
in
0 1
;
do
for
use_idx
in
0 1
;
do
set
-x
set
-x
./bin/ckProfiler reduce
$PRECISION
-D
256,14,14,1024
-R
0,1,2
-O
$op
-I
$use_idx
-v
1 1
$NREPEAT
####### datatype layout reduce dims op use index verify init repeats
./bin/ckProfiler reduce
$PRECISION
-D
256,28,28,128
-R
0,1,2
-O
$op
-I
$use_idx
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
256,14,14,1024
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
256,58,58,128
-R
0,1,2
-O
$op
-I
$use_idx
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
256,28,28,128
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
256,7,7,2048
-R
0,1,2
-O
$op
-I
$use_idx
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
256,58,58,128
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
256,14,14,256
-R
0,1,2
-O
$op
-I
$use_idx
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
256,7,7,2048
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
256,30,30,256
-R
0,1,2
-O
$op
-I
$use_idx
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
256,14,14,256
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
256,56,56,256
-R
0,1,2
-O
$op
-I
$use_idx
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
256,30,30,256
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
256,16,16,512
-R
0,1,2
-O
$op
-I
$use_idx
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
256,56,56,256
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
256,28,28,512
-R
0,1,2
-O
$op
-I
$use_idx
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
256,16,16,512
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
256,7,7,512
-R
0,1,2
-O
$op
-I
$use_idx
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
256,28,28,512
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
256,56,56,64
-R
0,1,2
-O
$op
-I
$use_idx
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
256,7,7,512
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
256,230,230,3
-R
0,1,2
-O
$op
-I
$use_idx
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
256,56,56,64
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
128,14,14,1024
-R
0,1,2
-O
$op
-I
$use_idx
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
256,230,230,3
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
128,28,28,128
-R
0,1,2
-O
$op
-I
$use_idx
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
128,14,14,1024
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
128,58,58,128
-R
0,1,2
-O
$op
-I
$use_idx
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
128,28,28,128
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
128,7,7,2048
-R
0,1,2
-O
$op
-I
$use_idx
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
128,58,58,128
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
128,14,14,256
-R
0,1,2
-O
$op
-I
$use_idx
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
128,7,7,2048
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
128,30,30,256
-R
0,1,2
-O
$op
-I
$use_idx
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
128,14,14,256
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
128,56,56,256
-R
0,1,2
-O
$op
-I
$use_idx
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
128,30,30,256
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
128,16,16,512
-R
0,1,2
-O
$op
-I
$use_idx
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
128,56,56,256
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
128,28,28,512
-R
0,1,2
-O
$op
-I
$use_idx
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
128,16,16,512
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
128,7,7,512
-R
0,1,2
-O
$op
-I
$use_idx
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
128,28,28,512
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
./bin/ckProfiler reduce
$PRECISION
-D
128,56,56,64
-R
0,1,2
-O
$op
-I
$use_idx
-v
1 1
$NREPEAT
$driver
reduce
$PRECISION
-D
128,7,7,512
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
$driver
reduce
$PRECISION
-D
128,56,56,64
-R
0,1,2
-O
$op
-I
$use_idx
$VERIFY
$INIT
$NREPEAT
set
+x
set
+x
done
done
done
done
...
...
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment