gaoqiong/composable_kernel, commit dd6a8de4

Merge branch 'develop' into jd/dev_pkg

Authored Apr 06, 2022 by Jehandad Khan
Parents: 0aa899aa, abf4bdb9

The full commit changes 470 files; this view shows 20 changed files, with 1668 additions and 118 deletions.
Files shown in this view:

- library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.cpp (+40 / -0)
- library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.cpp (+53 / -0)
- library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.cpp (+25 / -19)
- library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.cpp (+12 / -9)
- library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.cpp (+36 / -27)
- library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.cpp (+12 / -9)
- library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.cpp (+36 / -27)
- library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.cpp (+25 / -0)
- library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.cpp (+40 / -0)
- profiler/CMakeLists.txt (+13 / -3)
- profiler/README.md (+48 / -0)
- profiler/include/profile_batched_gemm_impl.hpp (+195 / -15)
- profiler/include/profile_batched_gemm_reduce_impl.hpp (+354 / -0)
- profiler/include/profile_conv_bwd_data_impl.hpp (+10 / -5)
- profiler/include/profile_conv_bwd_weight_impl.hpp (+275 / -0)
- profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp (+4 / -1)
- profiler/include/profile_conv_fwd_bias_relu_atomic_add_impl.hpp (+3 / -1)
- profiler/include/profile_conv_fwd_bias_relu_impl.hpp (+3 / -1)
- profiler/include/profile_conv_fwd_impl.hpp (+4 / -1)
- profiler/include/profile_convnd_bwd_data_impl.hpp (+480 / -0)
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.cpp (new file, mode 100644)

```cpp
#include "device_reduce_instance_multiblock_partial_reduce.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace device_reduce_instance {

// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 3); // for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 4);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 2, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 3); // for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 4);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 2, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 3); // for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 4);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 2, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 3); // for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 4);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 2, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 3); // for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 4);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 2, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 3); // for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 4);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1);
// clang-format on

} // namespace device_reduce_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
```
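A note on decoding the positional IDs above: going purely by the `// for ...` comments in these instance files, ReduceOpId 0 selects ADD, 2 MIN, 3 MAX, 4 AMAX, 5 AVG, and 7 NORM2. A hypothetical enum capturing that mapping (illustrative only; the library's real operation-ID definition is not part of this diff and may differ):

```cpp
// Hypothetical: inferred from the "// for ..." comments in this commit,
// not copied from composable_kernel's headers.
enum class ReduceOpId
{
    Add   = 0,
    Min   = 2,
    Max   = 3,
    AMax  = 4, // max of absolute values
    Avg   = 5,
    Norm2 = 7
};
```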
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.cpp (new file, mode 100644)

```cpp
#include "device_reduce_instance_threadwise.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace device_reduce_instance {

// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 3); // for ADD
ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 4);
ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 1);
ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 2, 1);
ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 3); // for AVG
ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 4);
ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 1);
ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 2, 1);
ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 3); // for NORM2
ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 4);
ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 1);
ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 2, 1);
ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 3); // for MIN
ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 4);
ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 1);
ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 2, 1);
ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 3); // for MAX
ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 4);
ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 1);
ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 2, 1);
ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 3); // for AMAX
ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 4);
ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 1);
ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 2, 1);
ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 3); // for MIN
ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 4);
ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 1);
ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 2, 1);
ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 3); // for MAX
ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 4);
ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 1);
ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 2, 1);
ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 3); // for AMAX
ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 4);
ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 1);
ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1);
// clang-format on

} // namespace device_reduce_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
```
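These bhalf_t instances accumulate in float (AccDataType = float), which a host-side check can honor exactly because a bfloat16 value is simply the upper 16 bits of an IEEE-754 binary32. A minimal widening sketch (illustrative; not the library's bhalf_t implementation):

```cpp
#include <cstdint>
#include <cstring>

// Widen a raw bfloat16 bit pattern to float. bf16 keeps the sign bit, the
// full 8-bit exponent, and the top 7 mantissa bits of binary32, so moving
// the 16 bits into the high half of a uint32_t reconstructs the value exactly.
float bf16_bits_to_float(std::uint16_t x)
{
    std::uint32_t bits = static_cast<std::uint32_t>(x) << 16;
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}
```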
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.cpp

```diff
@@ -6,25 +6,31 @@ namespace device {
 namespace device_reduce_instance {
 // clang-format off
-// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
-ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 0, 1, 2); // for MIN
-ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 0);       //
-ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1);       //
-ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 0, 1, 2); // for MAX
-ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 0);       //
-ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1);       //
-ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 0, 1, 2); // for AMAX
-ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 0);       //
-ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1);       //
-ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 0, 1, 2); // for MIN
-ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 0);       //
-ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1);       //
-ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 0, 1, 2); // for MAX
-ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 0);       //
-ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1);       //
-ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 0, 1, 2); // for AMAX
-ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 0);       //
-ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1);       //
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 3); // for MIN
+ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 4);
+ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 1);
+ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1);
+ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 3); // for MAX
+ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 4);
+ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 1);
+ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1);
+ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 4);
+ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 1);
+ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1);
+ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 3); // for MIN
+ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 4);
+ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 1);
+ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1);
+ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 3); // for MAX
+ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 4);
+ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 1);
+ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1);
+ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 4);
+ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1);
+ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1);
 // clang-format on
 }
 // namespace device_reduce_instance
```
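The pattern above repeats across the threadwise files that follow: the old macro form ended with an explicit list of reduced dimensions (for example `4, 0, 1, 2`: a rank-4 tensor reducing dims {0, 1, 2}), while the new form ends with just the count of reduced dimensions (`4, 3`), matching the column header's change from `ReduceDims` to `NumReduceDim`.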
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.cpp

```diff
@@ -6,16 +6,19 @@ namespace device {
 namespace device_reduce_instance {
 // clang-format off
-// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
-ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 0, 1, 2); // for ADD
-ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 0);
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 3); // for ADD
+ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 4);
+ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 1);
 ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1);
-ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 0, 1, 2); // for AVG
-ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 0);       //
-ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1);       //
-ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 0, 1, 2); // for NORM2
-ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 0);       //
-ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1);       //
+ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 3); // for AVG
+ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 4);
+ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 1);
+ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1);
+ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 3); // for NORM2
+ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 4);
+ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 1);
+ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1);
 // clang-format on
 }
 // namespace device_reduce_instance
```
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.cpp

```diff
@@ -6,34 +6,43 @@ namespace device {
 namespace device_reduce_instance {
 // clang-format off
-// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
-ADD_THREADWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 0, 1, 2); // for ADD
-ADD_THREADWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 0);
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_THREADWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 3); // for ADD
+ADD_THREADWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 4);
+ADD_THREADWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 1);
 ADD_THREADWISE_INST_BY_ID(float, float, float, 0, 0, 0, 2, 1);
-ADD_THREADWISE_INST_BY_ID(float, float, float, 5, 0, 0, 4, 0, 1, 2); // for AVG
-ADD_THREADWISE_INST_BY_ID(float, float, float, 5, 0, 0, 4, 0);       //
-ADD_THREADWISE_INST_BY_ID(float, float, float, 5, 0, 0, 2, 1);       //
-ADD_THREADWISE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 0, 1, 2); // for NORM2
-ADD_THREADWISE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 0);       //
-ADD_THREADWISE_INST_BY_ID(float, float, float, 7, 0, 0, 2, 1);       //
-ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 0, 1, 2); // for MIN
-ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 0);       //
-ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 0, 2, 1);       //
-ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 0, 1, 2); // for MAX
-ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 0);       //
-ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 0, 2, 1);       //
-ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 0, 1, 2); // for AMAX
-ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 0);       //
-ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 0, 2, 1);       //
-ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 0, 1, 2); // for MIN
-ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 0);       //
-ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 1, 2, 1);       //
-ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 0, 1, 2); // for MAX
-ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 0);       //
-ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 1, 2, 1);       //
-ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 0, 1, 2); // for AMAX
-ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 0);       //
-ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 1, 2, 1);       //
+ADD_THREADWISE_INST_BY_ID(float, float, float, 5, 0, 0, 4, 3); // for AVG
+ADD_THREADWISE_INST_BY_ID(float, float, float, 5, 0, 0, 4, 4);
+ADD_THREADWISE_INST_BY_ID(float, float, float, 5, 0, 0, 4, 1);
+ADD_THREADWISE_INST_BY_ID(float, float, float, 5, 0, 0, 2, 1);
+ADD_THREADWISE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 3); // for NORM2
+ADD_THREADWISE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 4);
+ADD_THREADWISE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 1);
+ADD_THREADWISE_INST_BY_ID(float, float, float, 7, 0, 0, 2, 1);
+ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 3); // for MIN
+ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 4);
+ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 1);
+ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 0, 2, 1);
+ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 3); // for MAX
+ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 4);
+ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 1);
+ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 0, 2, 1);
+ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 3); // for AMAX
+ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 4);
+ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 1);
+ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 0, 2, 1);
+ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 3); // for MIN
+ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 4);
+ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 1);
+ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 1, 2, 1);
+ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 3); // for MAX
+ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 4);
+ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 1);
+ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 1, 2, 1);
+ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 3); // for AMAX
+ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 4);
+ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 1);
+ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 1, 2, 1);
 // clang-format on
 }
 // namespace device_reduce_instance
```
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.cpp

```diff
@@ -6,16 +6,19 @@ namespace device {
 namespace device_reduce_instance {
 // clang-format off
-// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
-ADD_THREADWISE_INST_BY_ID(float, double, float, 0, 0, 0, 4, 0, 1, 2); // for ADD
-ADD_THREADWISE_INST_BY_ID(float, double, float, 0, 0, 0, 4, 0);
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_THREADWISE_INST_BY_ID(float, double, float, 0, 0, 0, 4, 3); // for ADD
+ADD_THREADWISE_INST_BY_ID(float, double, float, 0, 0, 0, 4, 4);
+ADD_THREADWISE_INST_BY_ID(float, double, float, 0, 0, 0, 4, 1);
 ADD_THREADWISE_INST_BY_ID(float, double, float, 0, 0, 0, 2, 1);
-ADD_THREADWISE_INST_BY_ID(float, double, float, 5, 0, 0, 4, 0, 1, 2); // for AVG
-ADD_THREADWISE_INST_BY_ID(float, double, float, 5, 0, 0, 4, 0);       //
-ADD_THREADWISE_INST_BY_ID(float, double, float, 5, 0, 0, 2, 1);       //
-ADD_THREADWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 0, 1, 2); // for NORM2
-ADD_THREADWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 0);       //
-ADD_THREADWISE_INST_BY_ID(float, double, float, 7, 0, 0, 2, 1);       //
+ADD_THREADWISE_INST_BY_ID(float, double, float, 5, 0, 0, 4, 3); // for AVG
+ADD_THREADWISE_INST_BY_ID(float, double, float, 5, 0, 0, 4, 4);
+ADD_THREADWISE_INST_BY_ID(float, double, float, 5, 0, 0, 4, 1);
+ADD_THREADWISE_INST_BY_ID(float, double, float, 5, 0, 0, 2, 1);
+ADD_THREADWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 3); // for NORM2
+ADD_THREADWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 4);
+ADD_THREADWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 1);
+ADD_THREADWISE_INST_BY_ID(float, double, float, 7, 0, 0, 2, 1);
 // clang-format on
 }
 // namespace device_reduce_instance
```
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.cpp

```diff
@@ -6,34 +6,43 @@ namespace device {
 namespace device_reduce_instance {
 // clang-format off
-// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
-ADD_THREADWISE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 0, 1, 2); // for ADD
-ADD_THREADWISE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 0);
+// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
+ADD_THREADWISE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD
+ADD_THREADWISE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 4);
+ADD_THREADWISE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 1);
 ADD_THREADWISE_INST_BY_ID(double, double, double, 0, 0, 0, 2, 1);
-ADD_THREADWISE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 0, 1, 2); // for AVG
-ADD_THREADWISE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 0);       //
-ADD_THREADWISE_INST_BY_ID(double, double, double, 5, 0, 0, 2, 1);       //
-ADD_THREADWISE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 0, 1, 2); // for NORM2
-ADD_THREADWISE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 0);       //
-ADD_THREADWISE_INST_BY_ID(double, double, double, 7, 0, 0, 2, 1);       //
-ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 0, 1, 2); // for MIN
-ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 0);       //
-ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 0, 2, 1);       //
-ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 0, 1, 2); // for MAX
-ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 0);       //
-ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 0, 2, 1);       //
-ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 0, 1, 2); // for AMAX
-ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 0);       //
-ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 0, 2, 1);       //
-ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 0, 1, 2); // for MIN
-ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 0);       //
-ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 1, 2, 1);       //
-ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 0, 1, 2); // for MAX
-ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 0);       //
-ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 1, 2, 1);       //
-ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 0, 1, 2); // for AMAX
-ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 0);       //
-ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 1, 2, 1);       //
+ADD_THREADWISE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG
+ADD_THREADWISE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 4);
+ADD_THREADWISE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 1);
+ADD_THREADWISE_INST_BY_ID(double, double, double, 5, 0, 0, 2, 1);
+ADD_THREADWISE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 3); // for NORM2
+ADD_THREADWISE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 4);
+ADD_THREADWISE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 1);
+ADD_THREADWISE_INST_BY_ID(double, double, double, 7, 0, 0, 2, 1);
+ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 3); // for MIN
+ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 4);
+ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 1);
+ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 0, 2, 1);
+ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 3); // for MAX
+ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 4);
+ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 1);
+ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 0, 2, 1);
+ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 3); // for AMAX
+ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 4);
+ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 1);
+ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 0, 2, 1);
+ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 3); // for MIN
+ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 4);
+ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 1);
+ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 1, 2, 1);
+ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 3); // for MAX
+ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 4);
+ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 1);
+ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 1, 2, 1);
+ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 3); // for AMAX
+ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 4);
+ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 1);
+ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 1, 2, 1);
 // clang-format on
 }
 // namespace device_reduce_instance
```
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.cpp (new file, mode 100644)

```cpp
#include "device_reduce_instance_threadwise.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace device_reduce_instance {

// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 3); // for ADD
ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 4);
ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 1);
ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 2, 1);
ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 3); // for AVG
ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 4);
ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 1);
ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 1);
// clang-format on

} // namespace device_reduce_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
```
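The ADD and AVG instances here widen to int32_t for accumulation, while MIN/MAX/AMAX (in the i8_i8_i8 file below) stay in int8_t: comparisons cannot overflow, but sums easily exceed int8_t's [-128, 127] range. A small self-contained illustration of the failure the wider accumulator avoids:

```cpp
#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

int main()
{
    // 100 values of 100: the true sum (10000) cannot be represented in int8_t.
    std::vector<std::int8_t> v(100, 100);

    std::int8_t narrow = 0;
    for(auto x : v)
        narrow = static_cast<std::int8_t>(narrow + x); // truncates modulo 256 each step

    std::int32_t wide = std::accumulate(v.begin(), v.end(), std::int32_t{0});

    std::cout << "int8 accumulator:  " << int(narrow) << "\n"; // wrapped garbage
    std::cout << "int32 accumulator: " << wide << "\n";        // 10000
}
```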
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.cpp (new file, mode 100644)

```cpp
#include "device_reduce_instance_threadwise.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace device_reduce_instance {

// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 3); // for MIN
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 4);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 1);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 2, 1);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 3); // for MAX
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 4);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 1);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 2, 1);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 3); // for AMAX
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 4);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 1);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 2, 1);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 3); // for MIN
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 4);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 1);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 2, 1);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 3); // for MAX
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 4);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 1);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 2, 1);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 3); // for AMAX
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 4);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1);
// clang-format on

} // namespace device_reduce_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
```
profiler/CMakeLists.txt

The include-directory block (@@ -15,6 +15,7 @@) now reads:

```cmake
include_directories(BEFORE
    ${PROJECT_SOURCE_DIR}/library/include/ck/library/tensor_operation_instance/gpu/reduce
    ${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/cpu
    ${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/gpu
    ${PROJECT_SOURCE_DIR}/library/include/ck/library/utility
    ${PROJECT_SOURCE_DIR}/profiler/include
    ${PROJECT_SOURCE_DIR}/external/include/half
)
```

The profiler source list and executable setup (@@ -26,18 +27,23 @@) now read:

```cmake
set(PROFILER_SOURCE
    src/profile_gemm_bias_2d.cpp
    src/profile_gemm_bias_relu.cpp
    src/profile_gemm_bias_relu_add.cpp
    src/profile_gemm_reduce.cpp
    src/profile_batched_gemm.cpp
    src/profile_conv_fwd.cpp
    src/profile_conv_fwd_bias_relu.cpp
    src/profile_conv_fwd_bias_relu_add.cpp
    src/profile_conv_fwd_bias_relu_atomic_add.cpp
    src/profile_conv_bwd_data.cpp
    src/profile_convnd_bwd_data.cpp
    src/profile_reduce.cpp
    src/profile_grouped_gemm.cpp
    src/profile_conv_bwd_weight.cpp
    src/profile_batched_gemm_reduce.cpp
)

add_executable(ckProfiler ${PROFILER_SOURCE})

target_link_libraries(ckProfiler PRIVATE host_tensor)
target_link_libraries(ckProfiler PRIVATE device_gemm_reduce_instance)
target_link_libraries(ckProfiler PRIVATE device_gemm_instance)
target_link_libraries(ckProfiler PRIVATE device_gemm_bias2d_instance)
target_link_libraries(ckProfiler PRIVATE device_gemm_bias_relu_instance)
```

The tail of the file (@@ -47,7 +53,11 @@) switches to the n-dimensional backward-data instance library and links the new instance libraries:

```diff
@@ -47,7 +53,11 @@ target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_instance)
 target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_instance)
 target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_add_instance)
 target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_atomic_add_instance)
-target_link_libraries(ckProfiler PRIVATE device_conv2d_bwd_data_instance)
+target_link_libraries(ckProfiler PRIVATE device_convnd_bwd_data_instance)
-target_link_libraries(ckProfiler PRIVATE device_reduce_instance)
-set_target_properties(ckProfiler PROPERTIES EXCLUDE_FROM_ALL 1)
\ No newline at end of file
+set_target_properties(ckProfiler PROPERTIES EXCLUDE_FROM_ALL 1)
+target_link_libraries(ckProfiler PRIVATE device_reduce_instance)
+target_link_libraries(ckProfiler PRIVATE device_grouped_gemm_instance)
+target_link_libraries(ckProfiler PRIVATE device_conv2d_bwd_weight_instance)
+target_link_libraries(ckProfiler PRIVATE device_batched_gemm_reduce_instance)
```
profiler/src/README.md → profiler/README.md

## Docker script
```bash
docker run                                      \
-it                                             \
--rm                                            \
--privileged                                    \
--group-add sudo                                \
-w /root/workspace                              \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace   \
rocm/tensorflow:rocm4.3.1-tf2.6-dev             \
/bin/bash
```

## Build ```ckProfiler```
```bash
mkdir build && cd build
```

```bash
# Need to specify target ID, example below is gfx908
cmake                                                                 \
-D BUILD_DEV=OFF                                                      \
-D CMAKE_BUILD_TYPE=Release                                           \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                             \
-D CMAKE_PREFIX_PATH=/opt/rocm                                        \
..
```

```bash
make -j ckProfiler
```

## Profile GEMM kernels
```bash
#arg1: tensor operation (gemm=GEMM)
...
```

The example invocation moved from `./profiler/ckProfiler` to `./bin/ckProfiler` (@@ -42,8 +9,8 @@):

```diff
 #arg7: run kernel # of times (>1)
 #arg8 to 13: M, N, K, StrideA, StrideB, StrideC
-##################### op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC
-./profiler/ckProfiler gemm 1 1 1 1 0 5 3840 4096 4096 4096 4096 4096
+################ op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC
+./bin/ckProfiler gemm 1 1 1 1 0 5 3840 4096 4096 4096 4096 4096
```

Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16):

```
c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
Best Perf: 1.1933 ms, 107.977 TFlops, 79.0848 GB/s
```
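As a quick arithmetic check on that number, using the flop convention this commit's profiler code applies elsewhere (flop = 2 * M * N * K per batch): 2 * 3840 * 4096 * 4096 is about 1.288e11 FLOP, and 1.288e11 FLOP / 1.1933 ms is about 107.98 TFlop/s, matching the printed 107.977 TFlops.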
The convolution section's heading and invocation were updated for the 2d-specific profiler op (@@ -55,7 +22,7 @@ and @@ -67,8 +34,8 @@):

```diff
-## Profile forward convolution kernels
+## Profile 2d forward convolution kernels
```

```bash
#arg1: tensor operation (conv=Convolution)
#arg2: data type (0=fp32, 1=fp16)
...
#arg8: print matrix value (0=no, 1=yes)
#arg9: run kernel # of times (>1)
#arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx
```

```diff
-##################### op datatype in_layout wei_layout out_layout verify init log repeat N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads
-./profiler/ckProfiler conv 1 1 1 1 1 1 0 5 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1
+################ op datatype in_layout wei_layout out_layout verify init log repeat N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads
+./bin/ckProfiler conv2d_fwd 1 1 1 1 1 1 0 5 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1
```

Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
profiler/include/profile_batched_gemm_impl.hpp

The file now begins:

```cpp
#pragma once

#include <memory>

#include "check_err.hpp"
#include "config.hpp"
#include "element_wise_operation.hpp"
#include "tensor_layout.hpp"
#include "device.hpp"
#include "host_tensor_generator.hpp"
#include "device_gemm.hpp"
#include "reference_batched_gemm.hpp"

namespace ck {
```

At @@ -11,10 +21,30 @@ the instance declarations now cover bf16, f16, f32, and int8 for all four layout combinations:

```cpp
        ck::tensor_operation::element_wise::PassThrough,
        ck::tensor_operation::element_wise::PassThrough>;

void add_device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instances(std::vector<DeviceGemmNoOpPtr>&);

void add_device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instances(std::vector<DeviceGemmNoOpPtr>&);

void add_device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instances(std::vector<DeviceGemmNoOpPtr>&);

void add_device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instances(std::vector<DeviceGemmNoOpPtr>&);

} // namespace device_batched_gemm_instance
} // namespace device
```
The entry point now returns bool instead of void (@@ -30,7 +60,7 @@):

```diff
           typename ALayout,
           typename BLayout,
           typename CLayout>
-void profile_batched_gemm_impl(int do_verification,
+bool profile_batched_gemm_impl(int do_verification,
                                int init_method,
                                bool do_log,
                                int nrepeat,
```
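The bool return lets a driver fold the verification outcomes of many profiled cases into one exit status. A minimal sketch of that aggregation pattern (the profile_case stand-in below is hypothetical, not the profiler's real entry point):

```cpp
#include <iostream>

// Hypothetical stand-in for a profile_*_impl call that returns pass/fail.
bool profile_case(int m, int n, int k) { return m > 0 && n > 0 && k > 0; }

int main()
{
    bool pass = true;
    // Accumulate exactly the way the diff does: pass = pass && (...).
    pass = pass && profile_case(3840, 4096, 4096);
    pass = pass && profile_case(1024, 1024, 1024);
    std::cout << (pass ? "all cases verified" : "verification FAILED") << std::endl;
    return pass ? 0 : 1;
}
```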
A pass flag is initialized at the top of the function body (@@ -42,6 +72,8 @@):

```diff
                                int StrideC,
                                int BatchCount = 1)
 {
+    bool pass = true;
+
     auto f_host_tensor_descriptor = [](std::size_t batch_count,
                                        std::size_t row,
                                        std::size_t col,
```
Optional f32 shadow tensors are added for bf16 verification, and host-side reference generation drops to a single thread (@@ -65,12 +97,14 @@):

```diff
         f_host_tensor_descriptor(BatchCount, M, N, StrideC, CLayout{}));
     Tensor<CDataType> c_g_m_n_device_result(
         f_host_tensor_descriptor(BatchCount, M, N, StrideC, CLayout{}));

+    std::unique_ptr<Tensor<float>> c_f32_g_m_n_host_result   = nullptr;
+    std::unique_ptr<Tensor<float>> c_f32_g_m_n_device_result = nullptr;
+
     std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl;
     std::cout << "b_g_k_n: " << b_g_k_n.mDesc << std::endl;
     std::cout << "c_g_m_n: " << c_g_m_n_host_result.mDesc << std::endl;

-    std::size_t num_thread = std::thread::hardware_concurrency();
+    std::size_t num_thread = 1;

     switch(init_method)
     {
     case 0: break;
```
Verification now branches on data type: for bf16, inputs are widened to f32 and the reference GEMM runs in f32; other types keep the old path (@@ -95,21 +129,56 @@):

```diff
     if(do_verification)
     {
-        using ReferenceBatchedGemmInstance =
-            ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
-                                                             BDataType,
-                                                             CDataType,
-                                                             AElementOp,
-                                                             BElementOp,
-                                                             CElementOp>;
+        if constexpr(is_same<ADataType, ck::bhalf_t>::value &&
+                     is_same<BDataType, ck::bhalf_t>::value &&
+                     is_same<CDataType, ck::bhalf_t>::value)
+        {
+            Tensor<float> a_f32_g_m_k(f_host_tensor_descriptor(BatchCount, M, K, StrideA, ALayout{}));
+            Tensor<float> b_f32_g_k_n(f_host_tensor_descriptor(BatchCount, K, N, StrideB, BLayout{}));
+
+            c_f32_g_m_n_host_result = std::make_unique<Tensor<float>>(
+                f_host_tensor_descriptor(BatchCount, M, N, StrideC, CLayout{}));
+            c_f32_g_m_n_device_result = std::make_unique<Tensor<float>>(
+                f_host_tensor_descriptor(BatchCount, M, N, StrideC, CLayout{}));
+
+            bf16_to_f32_(a_g_m_k, a_f32_g_m_k);
+            bf16_to_f32_(b_g_k_n, b_f32_g_k_n);
+
+            using ReferenceBatchedGemmInstance = ck::tensor_operation::host::
+                ReferenceBatchedGemm<float, float, float, AElementOp, BElementOp, CElementOp>;
+
+            auto ref_batched_gemm = ReferenceBatchedGemmInstance{};
+            auto ref_invoker      = ref_batched_gemm.MakeInvoker();
+
+            auto ref_argument = ref_batched_gemm.MakeArgument(a_f32_g_m_k,
+                                                              b_f32_g_k_n,
+                                                              *c_f32_g_m_n_host_result,
+                                                              a_element_op,
+                                                              b_element_op,
+                                                              c_element_op);
+
+            ref_invoker.Run(ref_argument);
+        }
+        else
+        {
+            using ReferenceBatchedGemmInstance =
+                ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
+                                                                 BDataType,
+                                                                 CDataType,
+                                                                 AElementOp,
+                                                                 BElementOp,
+                                                                 CElementOp>;

-        auto ref_batched_gemm = ReferenceBatchedGemmInstance{};
-        auto ref_invoker      = ref_batched_gemm.MakeInvoker();
+            auto ref_batched_gemm = ReferenceBatchedGemmInstance{};
+            auto ref_invoker      = ref_batched_gemm.MakeInvoker();

-        auto ref_argument = ref_batched_gemm.MakeArgument(
-            a_g_m_k, b_g_k_n, c_g_m_n_host_result, a_element_op, b_element_op, c_element_op);
+            auto ref_argument = ref_batched_gemm.MakeArgument(
+                a_g_m_k, b_g_k_n, c_g_m_n_host_result, a_element_op, b_element_op, c_element_op);

-        ref_invoker.Run(ref_argument);
+            ref_invoker.Run(ref_argument);
+        }
     }

     DeviceMem a_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSpace());
```
Instance selection gains bf16, f32, and int8 branches for all four layout combinations (@@ -156,6 +225,102 @@; added lines shown reformatted):

```cpp
                add_device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instances(gemm_ptrs);
        }
    }
    else if constexpr(is_same<ADataType, bhalf_t>::value && is_same<BDataType, bhalf_t>::value &&
                      is_same<CDataType, bhalf_t>::value)
    {
        if constexpr(is_same<ALayout, tensor_layout::gemm::RowMajor>::value &&
                     is_same<BLayout, tensor_layout::gemm::RowMajor>::value &&
                     is_same<CLayout, tensor_layout::gemm::RowMajor>::value)
        {
            ck::tensor_operation::device::device_batched_gemm_instance::
                add_device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instances(gemm_ptrs);
        }
        else if constexpr(is_same<ALayout, tensor_layout::gemm::RowMajor>::value &&
                          is_same<BLayout, tensor_layout::gemm::ColumnMajor>::value &&
                          is_same<CLayout, tensor_layout::gemm::RowMajor>::value)
        {
            ck::tensor_operation::device::device_batched_gemm_instance::
                add_device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instances(gemm_ptrs);
        }
        else if constexpr(is_same<ALayout, tensor_layout::gemm::ColumnMajor>::value &&
                          is_same<BLayout, tensor_layout::gemm::RowMajor>::value &&
                          is_same<CLayout, tensor_layout::gemm::RowMajor>::value)
        {
            ck::tensor_operation::device::device_batched_gemm_instance::
                add_device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instances(gemm_ptrs);
        }
        else if constexpr(is_same<ALayout, tensor_layout::gemm::ColumnMajor>::value &&
                          is_same<BLayout, tensor_layout::gemm::ColumnMajor>::value &&
                          is_same<CLayout, tensor_layout::gemm::RowMajor>::value)
        {
            ck::tensor_operation::device::device_batched_gemm_instance::
                add_device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instances(gemm_ptrs);
        }
    }
    else if constexpr(is_same<ADataType, float>::value && is_same<BDataType, float>::value &&
                      is_same<CDataType, float>::value)
    {
        if constexpr(is_same<ALayout, tensor_layout::gemm::RowMajor>::value &&
                     is_same<BLayout, tensor_layout::gemm::RowMajor>::value &&
                     is_same<CLayout, tensor_layout::gemm::RowMajor>::value)
        {
            ck::tensor_operation::device::device_batched_gemm_instance::
                add_device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instances(gemm_ptrs);
        }
        else if constexpr(is_same<ALayout, tensor_layout::gemm::RowMajor>::value &&
                          is_same<BLayout, tensor_layout::gemm::ColumnMajor>::value &&
                          is_same<CLayout, tensor_layout::gemm::RowMajor>::value)
        {
            ck::tensor_operation::device::device_batched_gemm_instance::
                add_device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instances(gemm_ptrs);
        }
        else if constexpr(is_same<ALayout, tensor_layout::gemm::ColumnMajor>::value &&
                          is_same<BLayout, tensor_layout::gemm::RowMajor>::value &&
                          is_same<CLayout, tensor_layout::gemm::RowMajor>::value)
        {
            ck::tensor_operation::device::device_batched_gemm_instance::
                add_device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instances(gemm_ptrs);
        }
        else if constexpr(is_same<ALayout, tensor_layout::gemm::ColumnMajor>::value &&
                          is_same<BLayout, tensor_layout::gemm::ColumnMajor>::value &&
                          is_same<CLayout, tensor_layout::gemm::RowMajor>::value)
        {
            ck::tensor_operation::device::device_batched_gemm_instance::
                add_device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instances(gemm_ptrs);
        }
    }
    else if constexpr(is_same<ADataType, int8_t>::value && is_same<BDataType, int8_t>::value &&
                      is_same<CDataType, int8_t>::value)
    {
        if constexpr(is_same<ALayout, tensor_layout::gemm::RowMajor>::value &&
                     is_same<BLayout, tensor_layout::gemm::RowMajor>::value &&
                     is_same<CLayout, tensor_layout::gemm::RowMajor>::value)
        {
            ck::tensor_operation::device::device_batched_gemm_instance::
                add_device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instances(gemm_ptrs);
        }
        else if constexpr(is_same<ALayout, tensor_layout::gemm::RowMajor>::value &&
                          is_same<BLayout, tensor_layout::gemm::ColumnMajor>::value &&
                          is_same<CLayout, tensor_layout::gemm::RowMajor>::value)
        {
            ck::tensor_operation::device::device_batched_gemm_instance::
                add_device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instances(gemm_ptrs);
        }
        else if constexpr(is_same<ALayout, tensor_layout::gemm::ColumnMajor>::value &&
                          is_same<BLayout, tensor_layout::gemm::RowMajor>::value &&
                          is_same<CLayout, tensor_layout::gemm::RowMajor>::value)
        {
            ck::tensor_operation::device::device_batched_gemm_instance::
                add_device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instances(gemm_ptrs);
        }
        else if constexpr(is_same<ALayout, tensor_layout::gemm::ColumnMajor>::value &&
                          is_same<BLayout, tensor_layout::gemm::ColumnMajor>::value &&
                          is_same<CLayout, tensor_layout::gemm::RowMajor>::value)
        {
            ck::tensor_operation::device::device_batched_gemm_instance::
                add_device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instances(gemm_ptrs);
        }
    }

    if(gemm_ptrs.size() <= 0)
    {
```
The correctness check now accumulates into pass and handles bf16 via the f32 shadow tensors (@@ -218,7 +383,20 @@):

```diff
         {
             c_device_buf.FromDevice(c_g_m_n_device_result.mData.data());

-            check_error(c_g_m_n_host_result, c_g_m_n_device_result);
+            if constexpr(is_same<ADataType, ck::bhalf_t>::value &&
+                         is_same<BDataType, ck::bhalf_t>::value &&
+                         is_same<CDataType, ck::bhalf_t>::value)
+            {
+                bf16_to_f32_(c_g_m_n_device_result, *c_f32_g_m_n_device_result);
+                float err = check_error(*c_f32_g_m_n_host_result, *c_f32_g_m_n_device_result);
+                pass      = pass && (err < 1E-6);
+            }
+            else
+            {
+                float err = check_error(c_g_m_n_host_result, c_g_m_n_device_result);
+                pass      = pass && (err < 1E-6);
+            }

             if(do_log)
             {
```
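Here check_error's return value is treated as a scalar error compared against 1E-6. A self-contained sketch of that style of metric (check_err.hpp's actual implementation may differ, for instance by using a relative error):

```cpp
#include <cmath>
#include <cstddef>
#include <vector>

// Illustrative max-absolute-error metric in the spirit of the check above.
float max_abs_error(const std::vector<float>& ref, const std::vector<float>& out)
{
    float err = 0.f;
    for(std::size_t i = 0; i < ref.size() && i < out.size(); ++i)
        err = std::fmax(err, std::fabs(ref[i] - out[i]));
    return err; // the profiler would compare this against 1E-6
}
```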
The best-performance summary is now followed by the new return value (@@ -241,6 +419,8 @@):

```diff
     std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
               << best_gb_per_sec << " GB/s, " << best_gemm_name << std::endl;
+
+    return pass;
 }

 } // namespace profiler
```
profiler/include/profile_batched_gemm_reduce_impl.hpp (new file, mode 100644)

```cpp
#pragma once

#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "host_conv.hpp"
#include "tensor_layout.hpp"
#include "device_tensor.hpp"
#include "element_wise_operation.hpp"
#include "element_wise_reduce_operation.hpp"
#include "device_gemm_reduce.hpp"
#include "reference_batched_gemm.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace device_gemm_instance {

using DeviceGemmReduceNoOpPtr = ck::tensor_operation::device::DeviceGemmReducePtr<
    ck::tensor_operation::element_wise::PassThrough,
    ck::tensor_operation::element_wise::PassThrough,
    ck::tensor_operation::element_wise::PassThrough,
    ck::tensor_operation::element_wise::ReduceSum,
    ck::tensor_operation::element_wise::ReduceSquareSum>;

void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instances(
    std::vector<DeviceGemmReduceNoOpPtr>&);
void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instances(
    std::vector<DeviceGemmReduceNoOpPtr>&);
void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instances(
    std::vector<DeviceGemmReduceNoOpPtr>&);
void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instances(
    std::vector<DeviceGemmReduceNoOpPtr>&);

} // namespace device_gemm_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

namespace ck {
namespace profiler {

template <typename ADataType,
          typename BDataType,
          typename CDataType,
          typename DDataType,
          typename ALayout,
          typename BLayout,
          typename CLayout>
bool profile_batched_gemm_reduce_impl(int do_verification,
                                      int init_method,
                                      bool do_log,
                                      int nrepeat,
                                      int M,
                                      int N,
                                      int K,
                                      int StrideA,
                                      int StrideB,
                                      int StrideC,
                                      int BatchCount)
{
    bool pass = true;

    auto f_host_tensor_descriptor = [](std::size_t batch_count,
                                       std::size_t row,
                                       std::size_t col,
                                       std::size_t stride,
                                       auto layout) {
        if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
        {
            return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
                                        std::vector<std::size_t>({row * stride, stride, 1}));
        }
        else
        {
            return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
                                        std::vector<std::size_t>({col * stride, 1, stride}));
        }
    };
```
```cpp
    Tensor<ADataType> a_g_m_k(f_host_tensor_descriptor(BatchCount, M, K, StrideA, ALayout{}));
    Tensor<BDataType> b_g_k_n(f_host_tensor_descriptor(BatchCount, K, N, StrideB, BLayout{}));

    Tensor<CDataType> c_g_m_n_host_result(
        f_host_tensor_descriptor(BatchCount, M, N, StrideC, CLayout{}));
    Tensor<DDataType> d0_g_m_host_result(HostTensorDescriptor(std::vector<std::size_t>(
        {static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)})));
    Tensor<DDataType> d1_g_m_host_result(HostTensorDescriptor(std::vector<std::size_t>(
        {static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)})));

    Tensor<CDataType> c_g_m_n_device_result(
        f_host_tensor_descriptor(BatchCount, M, N, StrideC, CLayout{}));
    Tensor<DDataType> d0_g_m_device_result(HostTensorDescriptor(std::vector<std::size_t>(
        {static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)})));
    Tensor<DDataType> d1_g_m_device_result(HostTensorDescriptor(std::vector<std::size_t>(
        {static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)})));

    std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl;
    std::cout << "b_g_k_n: " << b_g_k_n.mDesc << std::endl;
    std::cout << "c_g_m_n: " << c_g_m_n_host_result.mDesc << std::endl;
    std::cout << "d0_g_m: " << d0_g_m_host_result.mDesc << std::endl;
    std::cout << "d1_g_m: " << d1_g_m_host_result.mDesc << std::endl;

    std::size_t num_thread = std::thread::hardware_concurrency();

    switch(init_method)
    {
    case 0: break;
    case 1:
        std::srand(0);
        a_g_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5}, num_thread);
        b_g_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5}, num_thread);
        break;
    default:
        std::srand(0);
        a_g_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0}, num_thread);
        b_g_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5}, num_thread);
    }
```
```cpp
    using AElementOp = ck::tensor_operation::element_wise::PassThrough;
    using BElementOp = ck::tensor_operation::element_wise::PassThrough;
    using CElementOp = ck::tensor_operation::element_wise::PassThrough;
    using D0ReduceOp = ck::tensor_operation::element_wise::ReduceSum;
    using D1ReduceOp = ck::tensor_operation::element_wise::ReduceSquareSum;

    const auto a_element_op = AElementOp{};
    const auto b_element_op = BElementOp{};
    const auto c_element_op = CElementOp{};
    const auto d0_reduce_op = D0ReduceOp{};
    const auto d1_reduce_op = D1ReduceOp{};

    if(do_verification)
    {
        using ReferenceBatchedGemmInstance =
            ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
                                                             BDataType,
                                                             CDataType,
                                                             AElementOp,
                                                             BElementOp,
                                                             CElementOp>;

        auto ref_batched_gemm = ReferenceBatchedGemmInstance{};
        auto ref_invoker      = ref_batched_gemm.MakeInvoker();

        auto ref_argument = ref_batched_gemm.MakeArgument(
            a_g_m_k, b_g_k_n, c_g_m_n_host_result, a_element_op, b_element_op, c_element_op);

        ref_invoker.Run(ref_argument);

        for(int batch = 0; batch < BatchCount; ++batch)
        {
            for(int m = 0; m < M; ++m)
            {
                float d0_acc = d0_reduce_op.GetReduceZeroValue();
                float d1_acc = d1_reduce_op.GetReduceZeroValue();

                for(int n = 0; n < N; ++n)
                {
                    d0_reduce_op.Reduce(d0_acc, c_g_m_n_host_result(batch, m, n));
                    d1_reduce_op.Reduce(d1_acc, c_g_m_n_host_result(batch, m, n));
                }

                d0_g_m_host_result(batch, m) = d0_acc;
                d1_g_m_host_result(batch, m) = d1_acc;
            }
        }
    }
```
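For reference, the host loop above computes, per (batch, m), a row sum D0 and a row sum-of-squares D1 over the GEMM output. The same computation in a self-contained form (plain C++, assuming a row-major contiguous C; illustrative only):

```cpp
#include <cstddef>
#include <vector>

// d0[g*M + m] = sum_n c[(g*M + m)*N + n]
// d1[g*M + m] = sum_n c[(g*M + m)*N + n]^2
void reduce_rows(const std::vector<float>& c, int G, int M, int N,
                 std::vector<float>& d0, std::vector<float>& d1)
{
    for(int g = 0; g < G; ++g)
        for(int m = 0; m < M; ++m)
        {
            float s = 0.f, ss = 0.f;
            for(int n = 0; n < N; ++n)
            {
                const float v = c[(static_cast<std::size_t>(g) * M + m) * N + n];
                s += v;
                ss += v * v;
            }
            d0[static_cast<std::size_t>(g) * M + m] = s;
            d1[static_cast<std::size_t>(g) * M + m] = ss;
        }
}
```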
```cpp
    DeviceMem a_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSpace());
    DeviceMem b_device_buf(sizeof(BDataType) * b_g_k_n.mDesc.GetElementSpace());
    DeviceMem c_device_buf(sizeof(CDataType) * c_g_m_n_device_result.mDesc.GetElementSpace());
    DeviceMem d0_device_buf(sizeof(DDataType) * d0_g_m_device_result.mDesc.GetElementSpace());
    DeviceMem d1_device_buf(sizeof(DDataType) * d1_g_m_device_result.mDesc.GetElementSpace());

    a_device_buf.ToDevice(a_g_m_k.mData.data());
    b_device_buf.ToDevice(b_g_k_n.mData.data());

    // add device GEMM instances
    std::vector<ck::tensor_operation::device::device_gemm_instance::DeviceGemmReduceNoOpPtr>
        gemm_ptrs;

    if constexpr(is_same<ADataType, half_t>::value && is_same<BDataType, half_t>::value &&
                 is_same<CDataType, half_t>::value)
    {
        if constexpr(is_same<ALayout, tensor_layout::gemm::RowMajor>::value &&
                     is_same<BLayout, tensor_layout::gemm::RowMajor>::value &&
                     is_same<CLayout, tensor_layout::gemm::RowMajor>::value)
        {
            ck::tensor_operation::device::device_gemm_instance::
                add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instances(
                    gemm_ptrs);
        }
        else if constexpr(is_same<ALayout, tensor_layout::gemm::RowMajor>::value &&
                          is_same<BLayout, tensor_layout::gemm::ColumnMajor>::value &&
                          is_same<CLayout, tensor_layout::gemm::RowMajor>::value)
        {
            ck::tensor_operation::device::device_gemm_instance::
                add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instances(
                    gemm_ptrs);
        }
        else if constexpr(is_same<ALayout, tensor_layout::gemm::ColumnMajor>::value &&
                          is_same<BLayout, tensor_layout::gemm::RowMajor>::value &&
                          is_same<CLayout, tensor_layout::gemm::RowMajor>::value)
        {
            ck::tensor_operation::device::device_gemm_instance::
                add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instances(
                    gemm_ptrs);
        }
        else if constexpr(is_same<ALayout, tensor_layout::gemm::ColumnMajor>::value &&
                          is_same<BLayout, tensor_layout::gemm::ColumnMajor>::value &&
                          is_same<CLayout, tensor_layout::gemm::RowMajor>::value)
        {
            ck::tensor_operation::device::device_gemm_instance::
                add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instances(
                    gemm_ptrs);
        }
    }

    if(gemm_ptrs.size() <= 0)
    {
        throw std::runtime_error("wrong! no device GEMM instance found");
    }
```
std
::
string
best_gemm_name
;
float
best_ave_time
=
0
;
float
best_tflops
=
0
;
float
best_gb_per_sec
=
0
;
// profile device GEMM instances
for
(
auto
&
gemm_ptr
:
gemm_ptrs
)
{
auto
argument_ptr
=
gemm_ptr
->
MakeArgumentPointer
(
static_cast
<
ADataType
*>
(
a_device_buf
.
GetDeviceBuffer
()),
static_cast
<
BDataType
*>
(
b_device_buf
.
GetDeviceBuffer
()),
static_cast
<
CDataType
*>
(
c_device_buf
.
GetDeviceBuffer
()),
static_cast
<
DDataType
*>
(
d0_device_buf
.
GetDeviceBuffer
()),
static_cast
<
DDataType
*>
(
d1_device_buf
.
GetDeviceBuffer
()),
M
,
N
,
K
,
StrideA
,
StrideB
,
StrideC
,
a_element_op
,
b_element_op
,
c_element_op
,
d0_reduce_op
,
d1_reduce_op
,
BatchCount
);
auto
invoker_ptr
=
gemm_ptr
->
MakeInvokerPointer
();
if
(
gemm_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
// warm up
invoker_ptr
->
Run
(
argument_ptr
.
get
());
// timing
float
total_time
=
0
;
for
(
int
i
=
0
;
i
<
nrepeat
;
++
i
)
{
// init DO, D1 to 0
d0_device_buf
.
SetZero
();
d1_device_buf
.
SetZero
();
KernelTimer
timer
;
timer
.
Start
();
invoker_ptr
->
Run
(
argument_ptr
.
get
());
timer
.
End
();
total_time
+=
timer
.
GetElapsedTime
();
}
float
ave_time
=
total_time
/
nrepeat
;
std
::
string
gemm_name
=
gemm_ptr
->
GetTypeString
();
std
::
size_t
flop
=
std
::
size_t
(
2
)
*
BatchCount
*
M
*
N
*
K
;
std
::
size_t
num_btype
=
sizeof
(
ADataType
)
*
BatchCount
*
M
*
K
+
sizeof
(
BDataType
)
*
BatchCount
*
K
*
N
+
sizeof
(
CDataType
)
*
BatchCount
*
M
*
N
;
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
ave_time
;
float
gb_per_sec
=
num_btype
/
1.E6
/
ave_time
;
std
::
cout
<<
"Perf: "
<<
ave_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
<<
gemm_name
<<
std
::
endl
;
if
(
tflops
>
best_tflops
)
{
best_gemm_name
=
gemm_name
;
best_tflops
=
tflops
;
best_ave_time
=
ave_time
;
best_gb_per_sec
=
gb_per_sec
;
}
if
(
do_verification
)
{
c_device_buf
.
FromDevice
(
c_g_m_n_device_result
.
mData
.
data
());
d0_device_buf
.
FromDevice
(
d0_g_m_device_result
.
mData
.
data
());
d1_device_buf
.
FromDevice
(
d1_g_m_device_result
.
mData
.
data
());
float
c_error
=
check_error
(
c_g_m_n_host_result
,
c_g_m_n_device_result
);
float
d0_error
=
check_error
(
d0_g_m_host_result
,
d0_g_m_device_result
);
float
d1_error
=
check_error
(
d1_g_m_host_result
,
d1_g_m_device_result
);
pass
=
pass
&&
(
c_error
<
1E-6
);
pass
=
pass
&&
(
d0_error
<
1E-6
);
pass
=
pass
&&
(
d1_error
<
1E-6
);
if
(
do_log
)
{
LogRangeAsType
<
float
>
(
std
::
cout
<<
"a : "
,
a_g_m_k
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"b: "
,
b_g_k_n
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"c_host: "
,
c_g_m_n_host_result
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"c_device: "
,
c_g_m_n_device_result
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"d0_host: "
,
d0_g_m_host_result
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"d0_device: "
,
d0_g_m_device_result
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"d1_host: "
,
d1_g_m_host_result
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"d1_device: "
,
d1_g_m_device_result
.
mData
,
","
)
<<
std
::
endl
;
}
}
}
else
{
std
::
cout
<<
"does not support this GEMM problem"
<<
std
::
endl
;
}
}
std
::
cout
<<
"Best Perf: "
<<
best_ave_time
<<
" ms, "
<<
best_tflops
<<
" TFlops, "
<<
best_gb_per_sec
<<
" GB/s, "
<<
best_gemm_name
<<
std
::
endl
;
return
pass
;
}
}
// namespace profiler
}
// namespace ck
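The verification block above reduces the reference GEMM output along N twice, once with ReduceSum (D0) and once with ReduceSquareSum (D1). Below is a minimal standalone sketch of that host-side reduction in plain C++, with no CK types; the flattened indexing scheme is my own illustration, not code from this commit:

#include <iostream>
#include <vector>

int main()
{
    const int BatchCount = 2, M = 2, N = 3;

    // c[batch][m][n] flattened as batch * M * N + m * N + n
    std::vector<float> c = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};

    std::vector<float> d0(BatchCount * M, 0.f); // per-row sums
    std::vector<float> d1(BatchCount * M, 0.f); // per-row square-sums

    for(int b = 0; b < BatchCount; ++b)
        for(int m = 0; m < M; ++m)
            for(int n = 0; n < N; ++n)
            {
                float v = c[b * M * N + m * N + n];
                d0[b * M + m] += v;      // ReduceSum semantics
                d1[b * M + m] += v * v;  // ReduceSquareSum semantics
            }

    for(int i = 0; i < BatchCount * M; ++i)
        std::cout << "d0=" << d0[i] << " d1=" << d1[i] << "\n";
}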
profiler/include/profile_conv_bwd_data_impl.hpp
#pragma once

#include "check_err.hpp"
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
...
@@ -11,7 +13,7 @@
 using F16  = ck::half_t;
 using F32  = float;
-using BF16 = ushort;
+using BF16 = ck::bhalf_t;
 using INT8 = int8_t;

 namespace ck {
 namespace tensor_operation {
...
@@ -42,6 +44,7 @@ template <int NDimSpatial,
           typename InDataType,
           typename WeiDataType,
           typename OutDataType,
+          typename AccDataType,
           typename InLayout,
           typename WeiLayout,
           typename OutLayout>
...
@@ -123,6 +126,7 @@ void profile_conv_bwd_data_impl(int do_verification,
         ck::tensor_operation::host::ReferenceConvBwdData<InDataType,
                                                          WeiDataType,
                                                          OutDataType,
+                                                         AccDataType,
                                                          InElementOp,
                                                          WeiElementOp,
                                                          OutElementOp>;
...
@@ -172,9 +176,9 @@ void profile_conv_bwd_data_impl(int do_verification,
         ck::tensor_operation::device::device_conv2d_bwd_data_instance::
             add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
     }
-    else if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, ushort> &&
-                      ck::is_same_v<ck::remove_cv_t<WeiDataType>, ushort> &&
-                      ck::is_same_v<ck::remove_cv_t<OutDataType>, ushort>)
+    else if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, ck::bhalf_t> &&
+                      ck::is_same_v<ck::remove_cv_t<WeiDataType>, ck::bhalf_t> &&
+                      ck::is_same_v<ck::remove_cv_t<OutDataType>, ck::bhalf_t>)
     {
         ck::tensor_operation::device::device_conv2d_bwd_data_instance::
             add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances(conv_ptrs);
...
@@ -251,7 +255,8 @@ void profile_conv_bwd_data_impl(int do_verification,
     {
         in_device_buf.FromDevice(in_n_c_hi_wi_device_result.mData.data());

-        check_error(in_n_c_hi_wi_host_result, in_n_c_hi_wi_device_result);
+        ck::utils::check_err(in_n_c_hi_wi_device_result.mData,
+                             in_n_c_hi_wi_host_result.mData);

         if(do_log)
        {
...
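This hunk, and the matching hunks in the four forward-conv profilers below, switch verification from the older check_error helper to ck::utils::check_err, which compares the raw mData vectors of the device result against the host reference. For illustration, here is a simplified stand-in for that comparison; the function below is a hypothetical sketch, not the actual ck::utils::check_err (which also takes message and tolerance parameters):

// Hypothetical stand-in for ck::utils::check_err: element-wise comparison of a
// device result vector against a host reference vector.
#include <cmath>
#include <cstdio>
#include <vector>

template <typename T>
bool check_err(const std::vector<T>& result, const std::vector<T>& reference, double tol = 1e-6)
{
    if(result.size() != reference.size())
        return false;

    for(std::size_t i = 0; i < result.size(); ++i)
    {
        if(std::abs(double(result[i]) - double(reference[i])) > tol)
        {
            std::printf("mismatch at %zu: %f vs %f\n", i, double(result[i]), double(reference[i]));
            return false;
        }
    }
    return true;
}

int main()
{
    std::vector<float> device_out{1.0f, 2.0f, 3.0f};
    std::vector<float> host_ref{1.0f, 2.0f, 3.0f};
    return check_err(device_out, host_ref) ? 0 : 1;
}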
profiler/include/profile_conv_bwd_weight_impl.hpp
0 → 100644
#pragma once

#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "tensor_layout.hpp"
#include "device_tensor.hpp"
#include "device_conv_backward_weight.hpp"
#include "element_wise_operation.hpp"
#include "reference_conv_backward_weight.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace device_conv2d_bwd_weight_instance {

using DeviceConvBwdWeightNoOpPtr =
    DeviceConvBwdWeightPtr<ck::tensor_operation::element_wise::PassThrough,
                           ck::tensor_operation::element_wise::PassThrough,
                           ck::tensor_operation::element_wise::PassThrough>;

void add_device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f16_instances(
    std::vector<DeviceConvBwdWeightNoOpPtr>&);
void add_device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f32_instances(
    std::vector<DeviceConvBwdWeightNoOpPtr>&);

} // namespace device_conv2d_bwd_weight_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

namespace ck {
namespace profiler {

template <int NDimSpatial,
          typename InDataType,
          typename WeiDataType,
          typename OutDataType,
          typename InLayout,
          typename WeiLayout,
          typename OutLayout>
bool profile_conv_bwd_weight_impl(int do_verification,
                                  int init_method,
                                  bool do_log,
                                  int nrepeat,
                                  ck::index_t N,
                                  ck::index_t K,
                                  ck::index_t C,
                                  std::vector<ck::index_t> input_spatial_lengths,
                                  std::vector<ck::index_t> filter_spatial_lengths,
                                  std::vector<ck::index_t> output_spatial_lengths,
                                  std::vector<ck::index_t> conv_filter_strides,
                                  std::vector<ck::index_t> conv_filter_dilations,
                                  std::vector<ck::index_t> input_left_pads,
                                  std::vector<ck::index_t> input_right_pads,
                                  ck::index_t split_k)
{
    const ck::index_t Y = filter_spatial_lengths[0];
    const ck::index_t X = filter_spatial_lengths[1];

    const ck::index_t Hi = input_spatial_lengths[0];
    const ck::index_t Wi = input_spatial_lengths[1];

    const ck::index_t Ho = output_spatial_lengths[0];
    const ck::index_t Wo = output_spatial_lengths[1];

    auto f_host_tensor_descriptor =
        [](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W, auto layout) {
            if constexpr(is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value ||
                         is_same<decltype(layout), ck::tensor_layout::convolution::KCYX>::value ||
                         is_same<decltype(layout), ck::tensor_layout::convolution::NKHW>::value)
            {
                return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
                                            std::vector<std::size_t>({C_ * H * W, H * W, W, 1}));
            }
            else if constexpr(is_same<decltype(layout), tensor_layout::convolution::NHWC>::value ||
                              is_same<decltype(layout), tensor_layout::convolution::KYXC>::value ||
                              is_same<decltype(layout), tensor_layout::convolution::NHWK>::value)
            {
                return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
                                            std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
            }
        };

    Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{}));
    Tensor<WeiDataType> wei_k_c_y_x_host_result(f_host_tensor_descriptor(K, C, Y, X, WeiLayout{}));
    Tensor<WeiDataType> wei_k_c_y_x_device_result(f_host_tensor_descriptor(K, C, Y, X, WeiLayout{}));
    Tensor<OutDataType> out_n_k_ho_wo(f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));

    std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
    std::cout << "wei_k_c_y_x: " << wei_k_c_y_x_host_result.mDesc << std::endl;
    std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo.mDesc << std::endl;

    switch(init_method)
    {
    case 0: break;
    case 1:
        out_n_k_ho_wo.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
        in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
        break;
    default:
        out_n_k_ho_wo.GenerateTensorValue(GeneratorTensor_1<OutDataType>{1});
        in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_1<InDataType>{1});
    }

    using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
    using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
    using OutElementOp = ck::tensor_operation::element_wise::PassThrough;

    const auto in_element_op  = InElementOp{};
    const auto wei_element_op = WeiElementOp{};
    const auto out_element_op = OutElementOp{};

    if(do_verification)
    {
        using ReferenceConvBwdWeightInstance =
            ck::tensor_operation::host::ReferenceConvBwdWeight<InDataType,
                                                               WeiDataType,
                                                               OutDataType,
                                                               InElementOp,
                                                               WeiElementOp,
                                                               OutElementOp>;

        auto ref_conv    = ReferenceConvBwdWeightInstance{};
        auto ref_invoker = ref_conv.MakeInvoker();

        auto ref_argument = ref_conv.MakeArgument(in_n_c_hi_wi,
                                                  wei_k_c_y_x_host_result,
                                                  out_n_k_ho_wo,
                                                  conv_filter_strides,
                                                  conv_filter_dilations,
                                                  input_left_pads,
                                                  input_right_pads,
                                                  in_element_op,
                                                  wei_element_op,
                                                  out_element_op);

        ref_invoker.Run(ref_argument);
    }

    DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace());
    DeviceMem wei_device_buf(sizeof(WeiDataType) *
                             wei_k_c_y_x_device_result.mDesc.GetElementSpace());
    DeviceMem out_device_buf(sizeof(OutDataType) * out_n_k_ho_wo.mDesc.GetElementSpace());

    out_device_buf.ToDevice(out_n_k_ho_wo.mData.data());
    in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());

    using PassThrough = ck::tensor_operation::element_wise::PassThrough;

    using DeviceConvBwdWeightNoOpPtr =
        ck::tensor_operation::device::DeviceConvBwdWeightPtr<PassThrough, PassThrough, PassThrough>;

    // add device Conv instances
    std::vector<DeviceConvBwdWeightNoOpPtr> conv_ptrs;

    if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, float> &&
                 ck::is_same_v<ck::remove_cv_t<WeiDataType>, float> &&
                 ck::is_same_v<ck::remove_cv_t<OutDataType>, float>)
    {
        ck::tensor_operation::device::device_conv2d_bwd_weight_instance::
            add_device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f32_instances(conv_ptrs);
    }
    else if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, ck::half_t> &&
                      ck::is_same_v<ck::remove_cv_t<WeiDataType>, ck::half_t> &&
                      ck::is_same_v<ck::remove_cv_t<OutDataType>, ck::half_t>)
    {
        ck::tensor_operation::device::device_conv2d_bwd_weight_instance::
            add_device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
    }

    if(conv_ptrs.size() <= 0)
    {
        throw std::runtime_error("wrong! no device Conv instance found");
    }

    std::string best_conv_name;
    float best_ave_time   = 0;
    float best_tflops     = 0;
    float best_gb_per_sec = 0;

    // profile device Conv instances
    bool pass = true;
    for(auto& conv_ptr : conv_ptrs)
    {
        // using atomic, so need to reset input
        if(split_k > 1)
        {
            wei_device_buf.SetZero();
        }

        auto argument_ptr =
            conv_ptr->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
                                          static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
                                          static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
                                          N,
                                          K,
                                          C,
                                          input_spatial_lengths,
                                          filter_spatial_lengths,
                                          output_spatial_lengths,
                                          conv_filter_strides,
                                          conv_filter_dilations,
                                          input_left_pads,
                                          input_right_pads,
                                          in_element_op,
                                          wei_element_op,
                                          out_element_op,
                                          split_k);

        auto invoker_ptr = conv_ptr->MakeInvokerPointer();

        if(conv_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            std::string conv_name = conv_ptr->GetTypeString();

            float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);

            std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;

            std::size_t num_btype = sizeof(InDataType) * (N * C * Hi * Wi) +
                                    sizeof(WeiDataType) * (K * C * Y * X) +
                                    sizeof(OutDataType) * (N * K * Ho * Wo);

            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;

            float gb_per_sec = num_btype / 1.E6 / ave_time;

            std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
                      << " GB/s, " << conv_name << std::endl;

            if(tflops > best_tflops)
            {
                best_conv_name  = conv_name;
                best_tflops     = tflops;
                best_ave_time   = ave_time;
                best_gb_per_sec = gb_per_sec;
            }

            if(do_verification)
            {
                wei_device_buf.FromDevice(wei_k_c_y_x_device_result.mData.data());

                float max_error = check_error(wei_k_c_y_x_host_result, wei_k_c_y_x_device_result);
                if(max_error > 8)
                {
                    pass = false;
                    std::cout << "Fail info:" << conv_ptr->GetTypeString() << std::endl;
                }

                if(do_log)
                {
                    LogRangeAsType<float>(std::cout << "out: ", out_n_k_ho_wo.mData, ",")
                        << std::endl;
                    LogRangeAsType<float>(std::cout << "in : ", in_n_c_hi_wi.mData, ",")
                        << std::endl;
                    LogRangeAsType<float>(std::cout << "wei_host : ", wei_k_c_y_x_host_result.mData, ",")
                        << std::endl;
                    LogRangeAsType<float>(std::cout << "wei_device: ", wei_k_c_y_x_device_result.mData, ",")
                        << std::endl;
                }
            }
        }
    }

    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
              << best_gb_per_sec << " GB/s, " << best_conv_name << std::endl;

    return pass;
}

} // namespace profiler
} // namespace ck
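For orientation, here is a hedged sketch of how this new header might be driven from a profiler entry point. The shape values and the NHWC/KYXC/NHWK layout choice are made-up example parameters (not values from the commit); split_k is set to 1 since, per the comment in the loop above, split_k > 1 accumulates atomically and requires zeroing the weight buffer before each run:

// Hypothetical driver for profile_conv_bwd_weight_impl (example values only).
#include "profile_conv_bwd_weight_impl.hpp"

int main()
{
    namespace ctl = ck::tensor_layout::convolution;

    // 2D NHWC backward-weight problem: N=4, K=64, C=32, 3x3 filter,
    // 28x28 input, stride 1, dilation 1, pad 1 -> 28x28 output.
    bool pass = ck::profiler::profile_conv_bwd_weight_impl<2,
                                                           ck::half_t, // InDataType
                                                           ck::half_t, // WeiDataType
                                                           ck::half_t, // OutDataType
                                                           ctl::NHWC,
                                                           ctl::KYXC,
                                                           ctl::NHWK>(
        /*do_verification=*/1,
        /*init_method=*/1,
        /*do_log=*/false,
        /*nrepeat=*/10,
        /*N=*/4, /*K=*/64, /*C=*/32,
        /*input_spatial_lengths=*/{28, 28},
        /*filter_spatial_lengths=*/{3, 3},
        /*output_spatial_lengths=*/{28, 28},
        /*conv_filter_strides=*/{1, 1},
        /*conv_filter_dilations=*/{1, 1},
        /*input_left_pads=*/{1, 1},
        /*input_right_pads=*/{1, 1},
        /*split_k=*/1);

    return pass ? 0 : 1;
}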
profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp
#pragma once

#include "check_err.hpp"
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
...
@@ -245,7 +247,8 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,
     {
         out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());

-        check_error(out_n_k_ho_wo_host_result, out_n_k_ho_wo_device_result);
+        ck::utils::check_err(out_n_k_ho_wo_device_result.mData,
+                             out_n_k_ho_wo_host_result.mData);

         if(do_log)
         {
...
profiler/include/profile_conv_fwd_bias_relu_atomic_add_impl.hpp
#pragma once

#include "check_err.hpp"
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
...
@@ -301,7 +302,8 @@ void profile_conv_fwd_bias_relu_atomic_add_impl(int do_verification,
     {
         out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());

-        check_error(out_n_k_ho_wo_host_result, out_n_k_ho_wo_device_result);
+        ck::utils::check_err(out_n_k_ho_wo_device_result.mData,
+                             out_n_k_ho_wo_host_result.mData);

         if(do_log)
         {
...
profiler/include/profile_conv_fwd_bias_relu_impl.hpp
#pragma once

#include "check_err.hpp"
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
...
@@ -233,7 +234,8 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,
     {
         out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());

-        check_error(out_n_k_ho_wo_host_result, out_n_k_ho_wo_device_result);
+        ck::utils::check_err(out_n_k_ho_wo_device_result.mData,
+                             out_n_k_ho_wo_host_result.mData);

         if(do_log)
         {
...
profiler/include/profile_conv_fwd_impl.hpp
#pragma once

#include "check_err.hpp"
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
...
@@ -253,7 +255,8 @@ void profile_conv_fwd_impl(int do_verification,
     {
         out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());

-        check_error(out_n_k_ho_wo_host_result, out_n_k_ho_wo_device_result);
+        ck::utils::check_err(out_n_k_ho_wo_device_result.mData,
+                             out_n_k_ho_wo_host_result.mData);

         if(do_log)
         {
...
profiler/include/profile_convnd_bwd_data_impl.hpp
0 → 100644
#pragma once

#include "config.hpp"
#include "device.hpp"
#include "conv_fwd_util.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "tensor_layout.hpp"
#include "device_tensor.hpp"
#include "device_conv_bwd_data.hpp"
#include "element_wise_operation.hpp"
#include "reference_conv_bwd_data.hpp"

using F16  = ck::half_t;
using F32  = float;
using BF16 = ck::bhalf_t;
using INT8 = int8_t;

namespace ck {
namespace tensor_operation {
namespace device {
namespace device_conv2d_bwd_data_instance {

using DeviceConvBwdDataNoOpPtr =
    DeviceConvBwdDataPtr<ck::tensor_operation::element_wise::PassThrough,
                         ck::tensor_operation::element_wise::PassThrough,
                         ck::tensor_operation::element_wise::PassThrough>;

void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instances(
    std::vector<DeviceConvBwdDataNoOpPtr>&);
void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instances(
    std::vector<DeviceConvBwdDataNoOpPtr>&);
void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instances(
    std::vector<DeviceConvBwdDataNoOpPtr>&);
void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instances(
    std::vector<DeviceConvBwdDataNoOpPtr>&);
void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances(
    std::vector<DeviceConvBwdDataNoOpPtr>&);
void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances(
    std::vector<DeviceConvBwdDataNoOpPtr>&);
void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances(
    std::vector<DeviceConvBwdDataNoOpPtr>&);
void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances(
    std::vector<DeviceConvBwdDataNoOpPtr>&);
void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instances(
    std::vector<DeviceConvBwdDataNoOpPtr>&);
void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instances(
    std::vector<DeviceConvBwdDataNoOpPtr>&);
void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instances(
    std::vector<DeviceConvBwdDataNoOpPtr>&);
void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instances(
    std::vector<DeviceConvBwdDataNoOpPtr>&);

} // namespace device_conv2d_bwd_data_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

namespace ck {
namespace profiler {

using DeviceConvBwdDataNoOpPtr =
    ck::tensor_operation::device::device_conv2d_bwd_data_instance::DeviceConvBwdDataNoOpPtr;

template <typename InLayout>
HostTensorDescriptor get_input_host_tensor_descriptor(const std::vector<std::size_t>& dims,
                                                      int num_dim_spatial = 2)
{
    namespace tl = ck::tensor_layout::convolution;

    switch(num_dim_spatial)
    {
    case 3: {
        return ck::utils::conv::get_host_tensor_descriptor(dims, InLayout{});
    }
    case 2: {
        return ck::utils::conv::get_host_tensor_descriptor(dims, InLayout{});
    }
    case 1: {
        return ck::utils::conv::get_host_tensor_descriptor(dims, InLayout{});
    }
    default: {
        throw std::runtime_error("Unsupported number of spatial dimensions provided!");
    }
    }
}

template <typename WeiLayout>
HostTensorDescriptor get_filters_host_tensor_descriptor(const std::vector<std::size_t>& dims,
                                                        int num_dim_spatial = 2)
{
    namespace tl = ck::tensor_layout::convolution;

    switch(num_dim_spatial)
    {
    case 3: {
        return ck::utils::conv::get_host_tensor_descriptor(dims, WeiLayout{});
    }
    case 2: {
        return ck::utils::conv::get_host_tensor_descriptor(dims, WeiLayout{});
    }
    case 1: {
        return ck::utils::conv::get_host_tensor_descriptor(dims, WeiLayout{});
    }
    default: {
        throw std::runtime_error("Unsupported number of spatial dimensions provided!");
    }
    }
}

template <typename OutLayout>
HostTensorDescriptor get_output_host_ensor_descriptor(const std::vector<std::size_t>& dims,
                                                      int num_dim_spatial = 2)
{
    namespace tl = ck::tensor_layout::convolution;

    switch(num_dim_spatial)
    {
    case 3: {
        return ck::utils::conv::get_host_tensor_descriptor(dims, OutLayout{});
    }
    case 2: {
        return ck::utils::conv::get_host_tensor_descriptor(dims, OutLayout{});
    }
    case 1: {
        return ck::utils::conv::get_host_tensor_descriptor(dims, OutLayout{});
    }
    default: {
        throw std::runtime_error("Unsupported number of spatial dimensions provided!");
    }
    }
}

template <typename InDataType, typename WeiDataType, typename OutDataType>
void get_device_conv_bwd_data_op_ptr(
    InDataType, WeiDataType, OutDataType, std::vector<DeviceConvBwdDataNoOpPtr>&, int)
{
    std::cout << "can not find device conv bwd data" << std::endl;
    exit(1);
}

template <>
void get_device_conv_bwd_data_op_ptr(
    F32, F32, F32, std::vector<DeviceConvBwdDataNoOpPtr>& conv_ptrs, int num_dim_spatial)
{
    switch(num_dim_spatial)
    {
    case 1:
        ck::tensor_operation::device::device_conv2d_bwd_data_instance::
            add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instances(conv_ptrs);
        break;
    case 2:
        ck::tensor_operation::device::device_conv2d_bwd_data_instance::
            add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances(conv_ptrs);
        break;
    case 3:
        ck::tensor_operation::device::device_conv2d_bwd_data_instance::
            add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instances(conv_ptrs);
        break;
    default: break;
    }
}

template <>
void get_device_conv_bwd_data_op_ptr(
    F16, F16, F16, std::vector<DeviceConvBwdDataNoOpPtr>& conv_ptrs, int num_dim_spatial)
{
    switch(num_dim_spatial)
    {
    case 1:
        ck::tensor_operation::device::device_conv2d_bwd_data_instance::
            add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instances(conv_ptrs);
        break;
    case 2:
        ck::tensor_operation::device::device_conv2d_bwd_data_instance::
            add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
        break;
    case 3:
        ck::tensor_operation::device::device_conv2d_bwd_data_instance::
            add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instances(conv_ptrs);
        break;
    default: break;
    }
}

template <>
void get_device_conv_bwd_data_op_ptr(
    BF16, BF16, BF16, std::vector<DeviceConvBwdDataNoOpPtr>& conv_ptrs, int num_dim_spatial)
{
    switch(num_dim_spatial)
    {
    case 1:
        ck::tensor_operation::device::device_conv2d_bwd_data_instance::
            add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instances(conv_ptrs);
        break;
    case 2:
        ck::tensor_operation::device::device_conv2d_bwd_data_instance::
            add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances(conv_ptrs);
        break;
    case 3:
        ck::tensor_operation::device::device_conv2d_bwd_data_instance::
            add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instances(conv_ptrs);
        break;
    default: break;
    }
}

template <>
void get_device_conv_bwd_data_op_ptr(
    INT8, INT8, INT8, std::vector<DeviceConvBwdDataNoOpPtr>& conv_ptrs, int num_dim_spatial)
{
    switch(num_dim_spatial)
    {
    case 1:
        ck::tensor_operation::device::device_conv2d_bwd_data_instance::
            add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instances(conv_ptrs);
        break;
    case 2:
        ck::tensor_operation::device::device_conv2d_bwd_data_instance::
            add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances(conv_ptrs);
        break;
    case 3:
        ck::tensor_operation::device::device_conv2d_bwd_data_instance::
            add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instances(conv_ptrs);
        break;
    default: break;
    }
}

template <typename T>
static bool check_out(const Tensor<T>& ref, const Tensor<T>& result)
{
    float max_diff = 1e-6;

    for(int i = 0; i < ref.mData.size(); ++i)
    {
        float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
        if(max_diff < diff)
        {
            return false;
        }
    }
    return true;
}

template <typename DataType>
void show_data_nhwc_layout(Tensor<DataType>& nhwc)
{
    std::cout << "[";
    for(int n = 0; n < nhwc.mDesc.GetLengths()[0]; n++)
    {
        std::cout << "[";
        for(int hi = 0; hi < nhwc.mDesc.GetLengths()[2]; hi++)
        {
            std::cout << "[";
            for(int wi = 0; wi < nhwc.mDesc.GetLengths()[3]; wi++)
            {
                std::cout << "[";
                for(int c = 0; c < nhwc.mDesc.GetLengths()[1]; c++)
                {
                    std::cout << static_cast<float>(nhwc(n, c, hi, wi)) << " ";
                }
                std::cout << "]";
            }
            std::cout << "]";
        }
        std::cout << "]";
    }
    std::cout << "]";
}

template <int NDimSpatial,
          typename InDataType,
          typename WeiDataType,
          typename OutDataType,
          typename AccDataType,
          typename InLayout,
          typename WeiLayout,
          typename OutLayout>
bool profile_convnd_bwd_data_impl(int do_verification,
                                  int init_method,
                                  bool do_log,
                                  int nrepeat,
                                  ck::index_t N,
                                  ck::index_t K,
                                  ck::index_t C,
                                  const std::vector<ck::index_t>& input_spatial_lengths,
                                  const std::vector<ck::index_t>& filter_spatial_lengths,
                                  const std::vector<ck::index_t>& output_spatial_lengths,
                                  const std::vector<ck::index_t>& conv_filter_strides,
                                  const std::vector<ck::index_t>& conv_filter_dilations,
                                  const std::vector<ck::index_t>& input_left_pads,
                                  const std::vector<ck::index_t>& input_right_pads)
{
    using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
    using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
    using OutElementOp = ck::tensor_operation::element_wise::PassThrough;

    const auto in_element_op  = InElementOp{};
    const auto wei_element_op = WeiElementOp{};
    const auto out_element_op = OutElementOp{};

    std::vector<std::size_t> input_dims{static_cast<std::size_t>(N), static_cast<std::size_t>(C)};
    input_dims.insert(
        std::end(input_dims), std::begin(input_spatial_lengths), std::end(input_spatial_lengths));

    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(K), static_cast<std::size_t>(C)};
    filter_dims.insert(std::end(filter_dims),
                       std::begin(filter_spatial_lengths),
                       std::end(filter_spatial_lengths));

    std::vector<std::size_t> output_dims{static_cast<std::size_t>(N), static_cast<std::size_t>(K)};
    output_dims.insert(std::end(output_dims),
                       std::begin(output_spatial_lengths),
                       std::end(output_spatial_lengths));

    Tensor<InDataType> input_host_result(
        get_input_host_tensor_descriptor<InLayout>(input_dims, NDimSpatial));
    Tensor<InDataType> input_device_result(
        get_input_host_tensor_descriptor<InLayout>(input_dims, NDimSpatial));
    Tensor<WeiDataType> weights(
        get_filters_host_tensor_descriptor<WeiLayout>(filter_dims, NDimSpatial));
    Tensor<OutDataType> output(
        get_output_host_ensor_descriptor<OutLayout>(output_dims, NDimSpatial));

    std::cout << "input: " << input_host_result.mDesc << std::endl;
    std::cout << "weights: " << weights.mDesc << std::endl;
    std::cout << "output: " << output.mDesc << std::endl;

    switch(init_method)
    {
    case 0: break;
    case 1:
        output.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
        weights.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
        break;
    default:
        output.GenerateTensorValue(GeneratorTensor_1<OutDataType>{1});
        weights.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{1});
    }

    DeviceMem in_device_buf(sizeof(InDataType) * input_device_result.mDesc.GetElementSpace());
    DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace());
    DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpace());

    out_device_buf.ToDevice(output.mData.data());
    wei_device_buf.ToDevice(weights.mData.data());
    // reset input to zero
    in_device_buf.SetZero();

    if(do_verification)
    {
        auto RunReference = [&](auto& ref_conv) {
            auto ref_invoker  = ref_conv.MakeInvoker();
            auto ref_argument = ref_conv.MakeArgument(input_host_result,
                                                      weights,
                                                      output,
                                                      conv_filter_strides,
                                                      conv_filter_dilations,
                                                      input_left_pads,
                                                      input_right_pads,
                                                      InElementOp{},
                                                      WeiElementOp{},
                                                      OutElementOp{});
            ref_invoker.Run(ref_argument);
        };

        auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdData<InDataType,
                                                                         WeiDataType,
                                                                         OutDataType,
                                                                         AccDataType,
                                                                         InElementOp,
                                                                         WeiElementOp,
                                                                         OutElementOp,
                                                                         NDimSpatial>();
        RunReference(ref_conv);
    }

    // add device Conv instances
    std::vector<DeviceConvBwdDataNoOpPtr> conv_ptrs;
    get_device_conv_bwd_data_op_ptr(
        InDataType{}, WeiDataType{}, OutDataType{}, conv_ptrs, NDimSpatial);

    if(conv_ptrs.size() <= 0)
    {
        throw std::runtime_error("wrong! no device Conv instance found");
    }

    std::string best_conv_name;
    float best_ave_time   = 0;
    float best_tflops     = 0;
    float best_gb_per_sec = 0;

    // profile device Conv instances
    bool success = true;
    for(auto& conv_ptr : conv_ptrs)
    {
        auto argument_ptr =
            conv_ptr->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
                                          static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
                                          static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
                                          N,
                                          K,
                                          C,
                                          input_spatial_lengths,
                                          filter_spatial_lengths,
                                          output_spatial_lengths,
                                          conv_filter_strides,
                                          conv_filter_dilations,
                                          input_left_pads,
                                          input_right_pads,
                                          in_element_op,
                                          wei_element_op,
                                          out_element_op);

        auto invoker_ptr = conv_ptr->MakeInvokerPointer();

        if(conv_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            std::string conv_name = conv_ptr->GetTypeString();

            float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);

            std::size_t flop =
                ck::utils::conv::get_flops(N, C, K, filter_spatial_lengths, output_spatial_lengths);

            std::size_t num_btype = ck::utils::conv::get_btype<InDataType, WeiDataType, OutDataType>(
                N, C, K, input_spatial_lengths, filter_spatial_lengths, output_spatial_lengths);

            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;

            float gb_per_sec = num_btype / 1.E6 / ave_time;

            std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
                      << " GB/s" << std::endl;

            if(tflops > best_tflops)
            {
                best_conv_name  = conv_name;
                best_tflops     = tflops;
                best_ave_time   = ave_time;
                best_gb_per_sec = gb_per_sec;
            }

            if(do_verification)
            {
                in_device_buf.FromDevice(input_device_result.mData.data());

                if(!check_out(input_host_result, input_device_result))
                {
                    std::cout << "Fail Info: " << conv_ptr->GetTypeString() << std::endl;
                    success = false;
                }
                else
                {
                    std::cout << "Pass Info: " << conv_ptr->GetTypeString() << std::endl;
                }

                check_error(input_host_result, input_device_result);

                if(do_log)
                {
                    std::cout << "in : ";
                    show_data_nhwc_layout(output);
                    std::cout << std::endl;

                    std::cout << "wei: ";
                    show_data_nhwc_layout(weights);
                    std::cout << std::endl;

                    std::cout << "out_host : ";
                    show_data_nhwc_layout(input_host_result);
                    std::cout << std::endl;

                    std::cout << "out_device: ";
                    show_data_nhwc_layout(input_device_result);
                    std::cout << std::endl;
                }
            }
        }
    }

    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
              << best_gb_per_sec << " GB/s, " << best_conv_name << std::endl;

    return success;
}

} // namespace profiler
} // namespace ck
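To close, a hedged sketch of how this N-dimensional profiler might be invoked for the 1D case. All shape values are made-up example parameters, and the 1D layout tags ctl::NWC/KXC/NWK are an assumption suggested by the nwc_kxc_nwk instance names above, not tags confirmed by this commit:

// Hypothetical driver for profile_convnd_bwd_data_impl (example values only).
#include "profile_convnd_bwd_data_impl.hpp"

int main()
{
    namespace ctl = ck::tensor_layout::convolution;

    // 1D NWC backward-data problem: N=16, K=64, C=32, length-3 filter,
    // input width 64, stride 1, dilation 1, pad 1 on both sides -> output width 64.
    bool ok = ck::profiler::profile_convnd_bwd_data_impl<1,
                                                         F32, // InDataType
                                                         F32, // WeiDataType
                                                         F32, // OutDataType
                                                         F32, // AccDataType
                                                         ctl::NWC,   // assumed 1D layout tags
                                                         ctl::KXC,
                                                         ctl::NWK>(
        /*do_verification=*/1,
        /*init_method=*/1,
        /*do_log=*/false,
        /*nrepeat=*/10,
        /*N=*/16, /*K=*/64, /*C=*/32,
        /*input_spatial_lengths=*/{64},
        /*filter_spatial_lengths=*/{3},
        /*output_spatial_lengths=*/{64},
        /*conv_filter_strides=*/{1},
        /*conv_filter_dilations=*/{1},
        /*input_left_pads=*/{1},
        /*input_right_pads=*/{1});

    return ok ? 0 : 1;
}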