Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
9dce6851
Commit
9dce6851
authored
Mar 10, 2022
by
Jing Zhang
Browse files
merge develop
parents
3cc57101
5d37d7bf
Changes
473
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1473 additions
and
34 deletions
+1473
-34
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f64.cpp
...ice_reduce_instance_blockwise_second_call_f64_f64_f64.cpp
+43
-0
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.cpp
...ice_reduce_instance_multiblock_atomic_add_f16_f32_f32.cpp
+22
-0
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.cpp
...ice_reduce_instance_multiblock_atomic_add_f32_f32_f32.cpp
+22
-0
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.cpp
...ice_reduce_instance_multiblock_atomic_add_f32_f64_f32.cpp
+22
-0
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.cpp
...reduce_instance_multiblock_partial_reduce_f16_f16_f16.cpp
+34
-0
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.cpp
...reduce_instance_multiblock_partial_reduce_f16_f32_f16.cpp
+25
-0
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.cpp
...reduce_instance_multiblock_partial_reduce_f32_f32_f32.cpp
+38
-0
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.cpp
...reduce_instance_multiblock_partial_reduce_f32_f64_f32.cpp
+19
-0
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.cpp
...reduce_instance_multiblock_partial_reduce_f64_f64_f64.cpp
+46
-0
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.cpp
.../reduce/device_reduce_instance_threadwise_f16_f16_f16.cpp
+34
-0
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.cpp
.../reduce/device_reduce_instance_threadwise_f16_f32_f16.cpp
+25
-0
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.cpp
.../reduce/device_reduce_instance_threadwise_f32_f32_f32.cpp
+43
-0
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.cpp
.../reduce/device_reduce_instance_threadwise_f32_f64_f32.cpp
+25
-0
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.cpp
.../reduce/device_reduce_instance_threadwise_f64_f64_f64.cpp
+43
-0
profiler/CMakeLists.txt
profiler/CMakeLists.txt
+24
-14
profiler/include/profile_conv_bwd_data_impl.hpp
profiler/include/profile_conv_bwd_data_impl.hpp
+278
-0
profiler/include/profile_conv_fwd_impl.hpp
profiler/include/profile_conv_fwd_impl.hpp
+3
-3
profiler/include/profile_gemm_impl.hpp
profiler/include/profile_gemm_impl.hpp
+101
-17
profiler/include/profile_reduce_impl.hpp
profiler/include/profile_reduce_impl.hpp
+626
-0
profiler/src/README.md
profiler/src/README.md
+0
-0
No files found.
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f64.cpp
0 → 100644
View file @
9dce6851
#include "device_reduce_instance_blockwise_second_call.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
4
,
0
,
1
,
2
);
// for ADD
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
4
,
0
);
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
4
,
0
,
1
,
2
);
// for AVG
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
4
,
0
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
2
,
1
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
4
,
0
,
1
,
2
);
// for NORM2
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
4
,
0
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
2
,
1
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
4
,
0
,
1
,
2
);
// for MIN
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
4
,
0
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
2
,
1
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
4
,
0
,
1
,
2
);
// for MAX
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
4
,
0
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
2
,
1
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
4
,
0
,
1
,
2
);
// for AMAX
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
4
,
0
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
2
,
1
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
4
,
0
,
1
,
2
);
// for MIN
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
4
,
0
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
2
,
1
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
4
,
0
,
1
,
2
);
// for MAX
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
4
,
0
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
2
,
1
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
4
,
0
,
1
,
2
);
// for AMAX
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
4
,
0
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
2
,
1
);
//
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.cpp
0 → 100644
View file @
9dce6851
#include "device_reduce_instance_multiblock_atomic_add.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID
(
half_t
,
float
,
float
,
0
,
0
,
0
,
4
,
0
,
1
,
2
);
// for ADD
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID
(
half_t
,
float
,
float
,
0
,
0
,
0
,
4
,
0
);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID
(
half_t
,
float
,
float
,
0
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID
(
half_t
,
float
,
float
,
5
,
0
,
0
,
4
,
0
,
1
,
2
);
// for AVG
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID
(
half_t
,
float
,
float
,
5
,
0
,
0
,
4
,
0
);
//
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID
(
half_t
,
float
,
float
,
5
,
0
,
0
,
2
,
1
);
//
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.cpp
0 → 100644
View file @
9dce6851
#include "device_reduce_instance_multiblock_atomic_add.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID
(
float
,
float
,
float
,
0
,
0
,
0
,
4
,
0
,
1
,
2
);
// for ADD
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID
(
float
,
float
,
float
,
0
,
0
,
0
,
4
,
0
);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID
(
float
,
float
,
float
,
0
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID
(
float
,
float
,
float
,
5
,
0
,
0
,
4
,
0
,
1
,
2
);
// for AVG
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID
(
float
,
float
,
float
,
5
,
0
,
0
,
4
,
0
);
//
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID
(
float
,
float
,
float
,
5
,
0
,
0
,
2
,
1
);
//
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.cpp
0 → 100644
View file @
9dce6851
#include "device_reduce_instance_multiblock_atomic_add.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID
(
float
,
double
,
float
,
0
,
0
,
0
,
4
,
0
,
1
,
2
);
// for ADD
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID
(
float
,
double
,
float
,
0
,
0
,
0
,
4
,
0
);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID
(
float
,
double
,
float
,
0
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID
(
float
,
double
,
float
,
5
,
0
,
0
,
4
,
0
,
1
,
2
);
// for AVG
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID
(
float
,
double
,
float
,
5
,
0
,
0
,
4
,
0
);
//
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID
(
float
,
double
,
float
,
5
,
0
,
0
,
2
,
1
);
//
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.cpp
0 → 100644
View file @
9dce6851
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
0
,
4
,
0
,
1
,
2
);
// for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
0
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
0
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
0
,
4
,
0
,
1
,
2
);
// for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
0
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
0
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
0
,
4
,
0
,
1
,
2
);
// for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
0
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
0
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
1
,
4
,
0
,
1
,
2
);
// for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
1
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
1
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
1
,
4
,
0
,
1
,
2
);
// for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
1
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
1
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
1
,
4
,
0
,
1
,
2
);
// for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
1
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
1
,
2
,
1
);
//
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.cpp
0 → 100644
View file @
9dce6851
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
half_t
,
float
,
half_t
,
0
,
0
,
0
,
4
,
0
,
1
,
2
);
// for ADD
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
half_t
,
float
,
half_t
,
0
,
0
,
0
,
4
,
0
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
half_t
,
float
,
half_t
,
0
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
half_t
,
float
,
half_t
,
5
,
0
,
0
,
4
,
0
,
1
,
2
);
// for AVG
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
half_t
,
float
,
half_t
,
5
,
0
,
0
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
half_t
,
float
,
half_t
,
5
,
0
,
0
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
half_t
,
float
,
half_t
,
7
,
0
,
0
,
4
,
0
,
1
,
2
);
// for NORM2
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
half_t
,
float
,
half_t
,
7
,
0
,
0
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
half_t
,
float
,
half_t
,
7
,
0
,
0
,
2
,
1
);
//
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.cpp
0 → 100644
View file @
9dce6851
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
float
,
float
,
float
,
2
,
0
,
0
,
4
,
0
,
1
,
2
);
// for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
float
,
float
,
float
,
2
,
0
,
0
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
float
,
float
,
float
,
2
,
0
,
0
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
float
,
float
,
float
,
3
,
0
,
0
,
4
,
0
,
1
,
2
);
// for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
float
,
float
,
float
,
3
,
0
,
0
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
float
,
float
,
float
,
3
,
0
,
0
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
float
,
float
,
float
,
4
,
0
,
0
,
4
,
0
,
1
,
2
);
// for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
float
,
float
,
float
,
4
,
0
,
0
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
float
,
float
,
float
,
4
,
0
,
0
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
float
,
float
,
float
,
2
,
0
,
1
,
4
,
0
,
1
,
2
);
// for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
float
,
float
,
float
,
2
,
0
,
1
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
float
,
float
,
float
,
2
,
0
,
1
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
float
,
float
,
float
,
3
,
0
,
1
,
4
,
0
,
1
,
2
);
// for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
float
,
float
,
float
,
3
,
0
,
1
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
float
,
float
,
float
,
3
,
0
,
1
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
float
,
float
,
float
,
4
,
0
,
1
,
4
,
0
,
1
,
2
);
// for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
float
,
float
,
float
,
4
,
0
,
1
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
float
,
float
,
float
,
4
,
0
,
1
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
float
,
float
,
float
,
7
,
0
,
0
,
4
,
0
,
1
,
2
);
// for NORM2
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
float
,
float
,
float
,
7
,
0
,
0
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
float
,
float
,
float
,
7
,
0
,
0
,
2
,
1
);
//
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.cpp
0 → 100644
View file @
9dce6851
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
float
,
double
,
float
,
7
,
0
,
0
,
4
,
0
,
1
,
2
);
// for NORM2
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
float
,
double
,
float
,
7
,
0
,
0
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
float
,
double
,
float
,
7
,
0
,
0
,
2
,
1
);
//
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.cpp
0 → 100644
View file @
9dce6851
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
4
,
0
,
1
,
2
);
// for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
4
,
0
,
1
,
2
);
// for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
4
,
0
,
1
,
2
);
// for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
4
,
0
,
1
,
2
);
// for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
4
,
0
,
1
,
2
);
// for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
4
,
0
,
1
,
2
);
// for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
4
,
0
,
1
,
2
);
// for NORM2
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
2
,
1
);
//
// Will be moved to use MultiBlockAtomicAdd
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
4
,
0
,
1
,
2
);
// for ADD
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
4
,
0
,
1
,
2
);
// for AVG
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
2
,
1
);
//
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.cpp
0 → 100644
View file @
9dce6851
#include "device_reduce_instance_threadwise.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
0
,
4
,
0
,
1
,
2
);
// for MIN
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
0
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
0
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
0
,
4
,
0
,
1
,
2
);
// for MAX
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
0
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
0
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
0
,
4
,
0
,
1
,
2
);
// for AMAX
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
0
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
0
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
1
,
4
,
0
,
1
,
2
);
// for MIN
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
1
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
1
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
1
,
4
,
0
,
1
,
2
);
// for MAX
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
1
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
1
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
1
,
4
,
0
,
1
,
2
);
// for AMAX
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
1
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
1
,
2
,
1
);
//
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.cpp
0 → 100644
View file @
9dce6851
#include "device_reduce_instance_threadwise.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
ADD_THREADWISE_INST_BY_ID
(
half_t
,
float
,
half_t
,
0
,
0
,
0
,
4
,
0
,
1
,
2
);
// for ADD
ADD_THREADWISE_INST_BY_ID
(
half_t
,
float
,
half_t
,
0
,
0
,
0
,
4
,
0
);
ADD_THREADWISE_INST_BY_ID
(
half_t
,
float
,
half_t
,
0
,
0
,
0
,
2
,
1
);
ADD_THREADWISE_INST_BY_ID
(
half_t
,
float
,
half_t
,
5
,
0
,
0
,
4
,
0
,
1
,
2
);
// for AVG
ADD_THREADWISE_INST_BY_ID
(
half_t
,
float
,
half_t
,
5
,
0
,
0
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
half_t
,
float
,
half_t
,
5
,
0
,
0
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
half_t
,
float
,
half_t
,
7
,
0
,
0
,
4
,
0
,
1
,
2
);
// for NORM2
ADD_THREADWISE_INST_BY_ID
(
half_t
,
float
,
half_t
,
7
,
0
,
0
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
half_t
,
float
,
half_t
,
7
,
0
,
0
,
2
,
1
);
//
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.cpp
0 → 100644
View file @
9dce6851
#include "device_reduce_instance_threadwise.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
0
,
0
,
0
,
4
,
0
,
1
,
2
);
// for ADD
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
0
,
0
,
0
,
4
,
0
);
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
0
,
0
,
0
,
2
,
1
);
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
5
,
0
,
0
,
4
,
0
,
1
,
2
);
// for AVG
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
5
,
0
,
0
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
5
,
0
,
0
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
7
,
0
,
0
,
4
,
0
,
1
,
2
);
// for NORM2
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
7
,
0
,
0
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
7
,
0
,
0
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
2
,
0
,
0
,
4
,
0
,
1
,
2
);
// for MIN
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
2
,
0
,
0
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
2
,
0
,
0
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
3
,
0
,
0
,
4
,
0
,
1
,
2
);
// for MAX
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
3
,
0
,
0
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
3
,
0
,
0
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
4
,
0
,
0
,
4
,
0
,
1
,
2
);
// for AMAX
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
4
,
0
,
0
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
4
,
0
,
0
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
2
,
0
,
1
,
4
,
0
,
1
,
2
);
// for MIN
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
2
,
0
,
1
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
2
,
0
,
1
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
3
,
0
,
1
,
4
,
0
,
1
,
2
);
// for MAX
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
3
,
0
,
1
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
3
,
0
,
1
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
4
,
0
,
1
,
4
,
0
,
1
,
2
);
// for AMAX
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
4
,
0
,
1
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
float
,
float
,
float
,
4
,
0
,
1
,
2
,
1
);
//
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.cpp
0 → 100644
View file @
9dce6851
#include "device_reduce_instance_threadwise.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
ADD_THREADWISE_INST_BY_ID
(
float
,
double
,
float
,
0
,
0
,
0
,
4
,
0
,
1
,
2
);
// for ADD
ADD_THREADWISE_INST_BY_ID
(
float
,
double
,
float
,
0
,
0
,
0
,
4
,
0
);
ADD_THREADWISE_INST_BY_ID
(
float
,
double
,
float
,
0
,
0
,
0
,
2
,
1
);
ADD_THREADWISE_INST_BY_ID
(
float
,
double
,
float
,
5
,
0
,
0
,
4
,
0
,
1
,
2
);
// for AVG
ADD_THREADWISE_INST_BY_ID
(
float
,
double
,
float
,
5
,
0
,
0
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
float
,
double
,
float
,
5
,
0
,
0
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
float
,
double
,
float
,
7
,
0
,
0
,
4
,
0
,
1
,
2
);
// for NORM2
ADD_THREADWISE_INST_BY_ID
(
float
,
double
,
float
,
7
,
0
,
0
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
float
,
double
,
float
,
7
,
0
,
0
,
2
,
1
);
//
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.cpp
0 → 100644
View file @
9dce6851
#include "device_reduce_instance_threadwise.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
4
,
0
,
1
,
2
);
// for ADD
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
4
,
0
);
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
2
,
1
);
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
4
,
0
,
1
,
2
);
// for AVG
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
4
,
0
,
1
,
2
);
// for NORM2
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
4
,
0
,
1
,
2
);
// for MIN
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
4
,
0
,
1
,
2
);
// for MAX
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
4
,
0
,
1
,
2
);
// for AMAX
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
4
,
0
,
1
,
2
);
// for MIN
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
4
,
0
,
1
,
2
);
// for MAX
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
2
,
1
);
//
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
4
,
0
,
1
,
2
);
// for AMAX
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
4
,
0
);
//
ADD_THREADWISE_INST_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
2
,
1
);
//
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
profiler/CMakeLists.txt
View file @
9dce6851
include_directories
(
BEFORE
include
${
PROJECT_SOURCE_DIR
}
/host/host_tensor/include
${
PROJECT_SOURCE_DIR
}
/device/include
${
PROJECT_SOURCE_DIR
}
/device_operation/include
${
PROJECT_SOURCE_DIR
}
/reference_operation/include
${
PROJECT_SOURCE_DIR
}
/include/ck
${
PROJECT_SOURCE_DIR
}
/include/ck/utility
${
PROJECT_SOURCE_DIR
}
/include/ck/tensor_description
${
PROJECT_SOURCE_DIR
}
/include/ck/tensor
${
PROJECT_SOURCE_DIR
}
/include/ck/problem_transform
${
PROJECT_SOURCE_DIR
}
/include/ck/tensor_operation/gpu/device
${
PROJECT_SOURCE_DIR
}
/include/ck/tensor_operation/gpu/grid
${
PROJECT_SOURCE_DIR
}
/include/ck/tensor_operation/gpu/block
${
PROJECT_SOURCE_DIR
}
/include/ck/tensor_operation/gpu/warp
${
PROJECT_SOURCE_DIR
}
/include/ck/tensor_operation/gpu/thread
${
PROJECT_SOURCE_DIR
}
/include/ck/tensor_operation/gpu/element
${
PROJECT_SOURCE_DIR
}
/library/include/ck/library/host_tensor
${
PROJECT_SOURCE_DIR
}
/library/include/ck/library/tensor_operation_instance
${
PROJECT_SOURCE_DIR
}
/library/include/ck/library/tensor_operation_instance/gpu/reduce
${
PROJECT_SOURCE_DIR
}
/library/include/ck/library/reference_tensor_operation/cpu
${
PROJECT_SOURCE_DIR
}
/library/include/ck/library/reference_tensor_operation/gpu
${
PROJECT_SOURCE_DIR
}
/profiler/include
${
PROJECT_SOURCE_DIR
}
/composable_kernel/include
${
PROJECT_SOURCE_DIR
}
/composable_kernel/include/utility
${
PROJECT_SOURCE_DIR
}
/composable_kernel/include/tensor_description
${
PROJECT_SOURCE_DIR
}
/composable_kernel/include/tensor_operation
${
PROJECT_SOURCE_DIR
}
/composable_kernel/include/problem_transform
${
PROJECT_SOURCE_DIR
}
/external/rocm/include
${
PROJECT_SOURCE_DIR
}
/external/include/half
)
# ck_profiler
...
...
@@ -20,22 +26,26 @@ set(PROFILER_SOURCE
src/profile_gemm_bias_2d.cpp
src/profile_gemm_bias_relu.cpp
src/profile_gemm_bias_relu_add.cpp
src/profile_batched_gemm.cpp
src/profile_conv_fwd.cpp
src/profile_conv_fwd_bias_relu.cpp
src/profile_conv_fwd_bias_relu_add.cpp
src/profile_conv_fwd_bias_relu_atomic_add.cpp
src/profile_batched_gemm.cpp
src/profile_conv_bwd_data.cpp
src/profile_reduce.cpp
)
add_executable
(
ckProfiler
${
PROFILER_SOURCE
}
)
target_link_libraries
(
ckProfiler PRIVATE host_tensor
)
target_link_libraries
(
ckProfiler PRIVATE device_gemm_instance
)
target_link_libraries
(
ckProfiler PRIVATE device_gemm_bias
_
2d_instance
)
target_link_libraries
(
ckProfiler PRIVATE device_gemm_bias2d_instance
)
target_link_libraries
(
ckProfiler PRIVATE device_gemm_bias_relu_instance
)
target_link_libraries
(
ckProfiler PRIVATE device_gemm_bias_relu_add_instance
)
target_link_libraries
(
ckProfiler PRIVATE device_batched_gemm_instance
)
target_link_libraries
(
ckProfiler PRIVATE device_conv2d_fwd_instance
)
target_link_libraries
(
ckProfiler PRIVATE device_conv2d_fwd_bias_relu_instance
)
target_link_libraries
(
ckProfiler PRIVATE device_conv2d_fwd_bias_relu_add_instance
)
target_link_libraries
(
ckProfiler PRIVATE device_conv2d_fwd_bias_relu_atomic_add_instance
)
target_link_libraries
(
ckProfiler PRIVATE device_batched_gemm_instance
)
target_link_libraries
(
ckProfiler PRIVATE device_conv2d_bwd_data_instance
)
target_link_libraries
(
ckProfiler PRIVATE device_reduce_instance
)
profiler/include/profile_conv_bwd_data_impl.hpp
0 → 100644
View file @
9dce6851
#pragma once
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "tensor_layout.hpp"
#include "device_tensor.hpp"
#include "device_conv_bwd_data.hpp"
#include "element_wise_operation.hpp"
#include "reference_conv_bwd_data.hpp"
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
BF16
=
ushort
;
using
INT8
=
int8_t
;
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_conv2d_bwd_data_instance
{
using
DeviceConvBwdDataNoOpPtr
=
DeviceConvBwdDataPtr
<
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
>
;
void
add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances
(
std
::
vector
<
DeviceConvBwdDataNoOpPtr
>&
);
void
add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances
(
std
::
vector
<
DeviceConvBwdDataNoOpPtr
>&
);
void
add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances
(
std
::
vector
<
DeviceConvBwdDataNoOpPtr
>&
);
void
add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances
(
std
::
vector
<
DeviceConvBwdDataNoOpPtr
>&
);
}
// namespace device_conv2d_bwd_data_instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
namespace
ck
{
namespace
profiler
{
template
<
int
NDimSpatial
,
typename
InDataType
,
typename
WeiDataType
,
typename
OutDataType
,
typename
InLayout
,
typename
WeiLayout
,
typename
OutLayout
>
void
profile_conv_bwd_data_impl
(
int
do_verification
,
int
init_method
,
bool
do_log
,
int
nrepeat
,
ck
::
index_t
N
,
ck
::
index_t
K
,
ck
::
index_t
C
,
std
::
vector
<
ck
::
index_t
>
input_spatial_lengths
,
std
::
vector
<
ck
::
index_t
>
filter_spatial_lengths
,
std
::
vector
<
ck
::
index_t
>
output_spatial_lengths
,
std
::
vector
<
ck
::
index_t
>
conv_filter_strides
,
std
::
vector
<
ck
::
index_t
>
conv_filter_dilations
,
std
::
vector
<
ck
::
index_t
>
input_left_pads
,
std
::
vector
<
ck
::
index_t
>
input_right_pads
)
{
const
ck
::
index_t
Y
=
filter_spatial_lengths
[
0
];
const
ck
::
index_t
X
=
filter_spatial_lengths
[
1
];
const
ck
::
index_t
Hi
=
input_spatial_lengths
[
0
];
const
ck
::
index_t
Wi
=
input_spatial_lengths
[
1
];
const
ck
::
index_t
Ho
=
output_spatial_lengths
[
0
];
const
ck
::
index_t
Wo
=
output_spatial_lengths
[
1
];
auto
f_host_tensor_descriptor
=
[](
std
::
size_t
N_
,
std
::
size_t
C_
,
std
::
size_t
H
,
std
::
size_t
W
,
auto
layout
)
{
if
constexpr
(
is_same
<
decltype
(
layout
),
ck
::
tensor_layout
::
convolution
::
NCHW
>::
value
||
is_same
<
decltype
(
layout
),
ck
::
tensor_layout
::
convolution
::
KCYX
>::
value
||
is_same
<
decltype
(
layout
),
ck
::
tensor_layout
::
convolution
::
NKHW
>::
value
)
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
N_
,
C_
,
H
,
W
}),
std
::
vector
<
std
::
size_t
>
({
C_
*
H
*
W
,
H
*
W
,
W
,
1
}));
}
else
if
constexpr
(
is_same
<
decltype
(
layout
),
tensor_layout
::
convolution
::
NHWC
>::
value
||
is_same
<
decltype
(
layout
),
tensor_layout
::
convolution
::
KYXC
>::
value
||
is_same
<
decltype
(
layout
),
tensor_layout
::
convolution
::
NHWK
>::
value
)
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
N_
,
C_
,
H
,
W
}),
std
::
vector
<
std
::
size_t
>
({
C_
*
H
*
W
,
1
,
W
*
C_
,
C_
}));
}
};
Tensor
<
InDataType
>
in_n_c_hi_wi_host_result
(
f_host_tensor_descriptor
(
N
,
C
,
Hi
,
Wi
,
InLayout
{}));
Tensor
<
InDataType
>
in_n_c_hi_wi_device_result
(
f_host_tensor_descriptor
(
N
,
C
,
Hi
,
Wi
,
InLayout
{}));
Tensor
<
WeiDataType
>
wei_k_c_y_x
(
f_host_tensor_descriptor
(
K
,
C
,
Y
,
X
,
WeiLayout
{}));
Tensor
<
OutDataType
>
out_n_k_ho_wo
(
f_host_tensor_descriptor
(
N
,
K
,
Ho
,
Wo
,
OutLayout
{}));
std
::
cout
<<
"in_n_c_hi_wi: "
<<
in_n_c_hi_wi_host_result
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"wei_k_c_y_x: "
<<
wei_k_c_y_x
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"out_n_k_ho_wo: "
<<
out_n_k_ho_wo
.
mDesc
<<
std
::
endl
;
switch
(
init_method
)
{
case
0
:
break
;
case
1
:
out_n_k_ho_wo
.
GenerateTensorValue
(
GeneratorTensor_2
<
InDataType
>
{
-
5
,
5
});
wei_k_c_y_x
.
GenerateTensorValue
(
GeneratorTensor_2
<
WeiDataType
>
{
-
5
,
5
});
break
;
default:
out_n_k_ho_wo
.
GenerateTensorValue
(
GeneratorTensor_3
<
InDataType
>
{
0.0
,
1.0
});
wei_k_c_y_x
.
GenerateTensorValue
(
GeneratorTensor_3
<
WeiDataType
>
{
-
0.5
,
0.5
});
}
using
InElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
WeiElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
OutElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
const
auto
in_element_op
=
InElementOp
{};
const
auto
wei_element_op
=
WeiElementOp
{};
const
auto
out_element_op
=
OutElementOp
{};
if
(
do_verification
)
{
using
ReferenceConvBwdDataInstance
=
ck
::
tensor_operation
::
host
::
ReferenceConvBwdData
<
InDataType
,
WeiDataType
,
OutDataType
,
InElementOp
,
WeiElementOp
,
OutElementOp
>
;
auto
ref_conv
=
ReferenceConvBwdDataInstance
{};
auto
ref_invoker
=
ref_conv
.
MakeInvoker
();
auto
ref_argument
=
ref_conv
.
MakeArgument
(
in_n_c_hi_wi_host_result
,
wei_k_c_y_x
,
out_n_k_ho_wo
,
conv_filter_strides
,
conv_filter_dilations
,
input_left_pads
,
input_right_pads
,
in_element_op
,
wei_element_op
,
out_element_op
);
ref_invoker
.
Run
(
ref_argument
);
}
DeviceMem
in_device_buf
(
sizeof
(
InDataType
)
*
in_n_c_hi_wi_device_result
.
mDesc
.
GetElementSpace
());
DeviceMem
wei_device_buf
(
sizeof
(
WeiDataType
)
*
wei_k_c_y_x
.
mDesc
.
GetElementSpace
());
DeviceMem
out_device_buf
(
sizeof
(
OutDataType
)
*
out_n_k_ho_wo
.
mDesc
.
GetElementSpace
());
out_device_buf
.
ToDevice
(
out_n_k_ho_wo
.
mData
.
data
());
wei_device_buf
.
ToDevice
(
wei_k_c_y_x
.
mData
.
data
());
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
DeviceConvBwdDataNoOpPtr
=
ck
::
tensor_operation
::
device
::
DeviceConvBwdDataPtr
<
PassThrough
,
PassThrough
,
PassThrough
>
;
// add device Conv instances
std
::
vector
<
DeviceConvBwdDataNoOpPtr
>
conv_ptrs
;
if
constexpr
(
ck
::
is_same_v
<
ck
::
remove_cv_t
<
InDataType
>
,
float
>
&&
ck
::
is_same_v
<
ck
::
remove_cv_t
<
WeiDataType
>
,
float
>
&&
ck
::
is_same_v
<
ck
::
remove_cv_t
<
OutDataType
>
,
float
>
)
{
ck
::
tensor_operation
::
device
::
device_conv2d_bwd_data_instance
::
add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances
(
conv_ptrs
);
}
else
if
constexpr
(
ck
::
is_same_v
<
ck
::
remove_cv_t
<
InDataType
>
,
ck
::
half_t
>
&&
ck
::
is_same_v
<
ck
::
remove_cv_t
<
WeiDataType
>
,
ck
::
half_t
>
&&
ck
::
is_same_v
<
ck
::
remove_cv_t
<
OutDataType
>
,
ck
::
half_t
>
)
{
ck
::
tensor_operation
::
device
::
device_conv2d_bwd_data_instance
::
add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances
(
conv_ptrs
);
}
else
if
constexpr
(
ck
::
is_same_v
<
ck
::
remove_cv_t
<
InDataType
>
,
ushort
>
&&
ck
::
is_same_v
<
ck
::
remove_cv_t
<
WeiDataType
>
,
ushort
>
&&
ck
::
is_same_v
<
ck
::
remove_cv_t
<
OutDataType
>
,
ushort
>
)
{
ck
::
tensor_operation
::
device
::
device_conv2d_bwd_data_instance
::
add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances
(
conv_ptrs
);
}
else
if
constexpr
(
ck
::
is_same_v
<
ck
::
remove_cv_t
<
InDataType
>
,
int8_t
>
&&
ck
::
is_same_v
<
ck
::
remove_cv_t
<
WeiDataType
>
,
int8_t
>
&&
ck
::
is_same_v
<
ck
::
remove_cv_t
<
OutDataType
>
,
int8_t
>
)
{
ck
::
tensor_operation
::
device
::
device_conv2d_bwd_data_instance
::
add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances
(
conv_ptrs
);
}
if
(
conv_ptrs
.
size
()
<=
0
)
{
throw
std
::
runtime_error
(
"wrong! no device Conv instance found"
);
}
std
::
string
best_conv_name
;
float
best_ave_time
=
0
;
float
best_tflops
=
0
;
float
best_gb_per_sec
=
0
;
// profile device Conv instances
for
(
auto
&
conv_ptr
:
conv_ptrs
)
{
auto
argument_ptr
=
conv_ptr
->
MakeArgumentPointer
(
static_cast
<
InDataType
*>
(
in_device_buf
.
GetDeviceBuffer
()),
static_cast
<
WeiDataType
*>
(
wei_device_buf
.
GetDeviceBuffer
()),
static_cast
<
OutDataType
*>
(
out_device_buf
.
GetDeviceBuffer
()),
N
,
K
,
C
,
input_spatial_lengths
,
filter_spatial_lengths
,
output_spatial_lengths
,
conv_filter_strides
,
conv_filter_dilations
,
input_left_pads
,
input_right_pads
,
in_element_op
,
wei_element_op
,
out_element_op
);
auto
invoker_ptr
=
conv_ptr
->
MakeInvokerPointer
();
if
(
conv_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
std
::
string
conv_name
=
conv_ptr
->
GetTypeString
();
float
ave_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
nrepeat
);
std
::
size_t
flop
=
std
::
size_t
(
2
)
*
N
*
K
*
Ho
*
Wo
*
C
*
Y
*
X
;
std
::
size_t
num_btype
=
sizeof
(
InDataType
)
*
(
N
*
C
*
Hi
*
Wi
)
+
sizeof
(
WeiDataType
)
*
(
K
*
C
*
Y
*
X
)
+
sizeof
(
OutDataType
)
*
(
N
*
K
*
Ho
*
Wo
);
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
ave_time
;
float
gb_per_sec
=
num_btype
/
1.E6
/
ave_time
;
std
::
cout
<<
"Perf: "
<<
ave_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
<<
conv_name
<<
std
::
endl
;
if
(
tflops
>
best_tflops
)
{
best_conv_name
=
conv_name
;
best_tflops
=
tflops
;
best_ave_time
=
ave_time
;
best_gb_per_sec
=
gb_per_sec
;
}
if
(
do_verification
)
{
in_device_buf
.
FromDevice
(
in_n_c_hi_wi_device_result
.
mData
.
data
());
check_error
(
in_n_c_hi_wi_host_result
,
in_n_c_hi_wi_device_result
);
if
(
do_log
)
{
LogRangeAsType
<
float
>
(
std
::
cout
<<
"in : "
,
out_n_k_ho_wo
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"wei: "
,
wei_k_c_y_x
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"out_host : "
,
in_n_c_hi_wi_host_result
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"out_device: "
,
in_n_c_hi_wi_device_result
.
mData
,
","
)
<<
std
::
endl
;
}
}
}
}
std
::
cout
<<
"Best Perf: "
<<
best_ave_time
<<
" ms, "
<<
best_tflops
<<
" TFlops, "
<<
best_gb_per_sec
<<
" GB/s, "
<<
best_conv_name
<<
std
::
endl
;
}
}
// namespace profiler
}
// namespace ck
profiler/include/profile_conv_fwd_impl.hpp
View file @
9dce6851
...
...
@@ -174,9 +174,9 @@ void profile_conv_fwd_impl(int do_verification,
ck
::
tensor_operation
::
device
::
device_conv2d_fwd_instance
::
add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances
(
conv_ptrs
);
}
else
if
constexpr
(
ck
::
is_same_v
<
ck
::
remove_cv_t
<
InDataType
>
,
ushor
t
>
&&
ck
::
is_same_v
<
ck
::
remove_cv_t
<
WeiDataType
>
,
ushor
t
>
&&
ck
::
is_same_v
<
ck
::
remove_cv_t
<
OutDataType
>
,
ushor
t
>
)
else
if
constexpr
(
ck
::
is_same_v
<
ck
::
remove_cv_t
<
InDataType
>
,
bhalf_
t
>
&&
ck
::
is_same_v
<
ck
::
remove_cv_t
<
WeiDataType
>
,
bhalf_
t
>
&&
ck
::
is_same_v
<
ck
::
remove_cv_t
<
OutDataType
>
,
bhalf_
t
>
)
{
ck
::
tensor_operation
::
device
::
device_conv2d_fwd_instance
::
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances
(
conv_ptrs
);
...
...
profiler/include/profile_gemm_impl.hpp
View file @
9dce6851
...
...
@@ -26,11 +26,17 @@ void add_device_gemm_xdl_f16_f16_f16_mk_nk_mn_instances(std::vector<DeviceGemmNo
void
add_device_gemm_xdl_f16_f16_f16_km_kn_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_xdl_f16_f16_f16_km_nk_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_xdl_c_shuffle_int8_int8_int8_mk_nk_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
...
...
@@ -91,12 +97,11 @@ void profile_gemm_impl(int do_verification,
Tensor
<
ADataType
>
a_m_k
(
f_host_tensor_descriptor
(
M
,
K
,
StrideA
,
ALayout
{}));
Tensor
<
BDataType
>
b_k_n
(
f_host_tensor_descriptor
(
K
,
N
,
StrideB
,
BLayout
{}));
Tensor
<
CDataType
>
c_m_n_host_result
(
f_host_tensor_descriptor
(
M
,
N
,
StrideC
,
CLayout
{}));
Tensor
<
CDataType
>
c_m_n_device_result
(
f_host_tensor_descriptor
(
M
,
N
,
StrideC
,
CLayout
{}));
std
::
cout
<<
"a_m_k: "
<<
a_m_k
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"b_k_n: "
<<
b_k_n
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"c_m_n: "
<<
c_m_n_
host
_result
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"c_m_n: "
<<
c_m_n_
device
_result
.
mDesc
<<
std
::
endl
;
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
();
switch
(
init_method
)
...
...
@@ -122,19 +127,10 @@ void profile_gemm_impl(int do_verification,
const
auto
b_element_op
=
BElementOp
{};
const
auto
c_element_op
=
CElementOp
{};
if
(
do_verification
)
{
using
ReferenceGemmInstance
=
ck
::
tensor_operation
::
host
::
ReferenceGemm
<
ADataType
,
BDataType
,
CDataType
,
AElementOp
,
BElementOp
,
CElementOp
>
;
auto
ref_gemm
=
ReferenceGemmInstance
{};
auto
ref_invoker
=
ref_gemm
.
MakeInvoker
();
// if(do_verification)
// {
auto
ref_argument
=
ref_gemm
.
MakeArgument
(
a_m_k
,
b_k_n
,
c_m_n_host_result
,
a_element_op
,
b_element_op
,
c_element_op
);
ref_invoker
.
Run
(
ref_argument
);
}
// }
DeviceMem
a_device_buf
(
sizeof
(
ADataType
)
*
a_m_k
.
mDesc
.
GetElementSpace
());
DeviceMem
b_device_buf
(
sizeof
(
BDataType
)
*
b_k_n
.
mDesc
.
GetElementSpace
());
...
...
@@ -290,6 +286,29 @@ void profile_gemm_impl(int do_verification,
}
}
}
else
if
constexpr
(
is_same
<
ADataType
,
ck
::
bhalf_t
>::
value
&&
is_same
<
BDataType
,
ck
::
bhalf_t
>::
value
&&
is_same
<
CDataType
,
ck
::
bhalf_t
>::
value
)
{
if
constexpr
(
is_same
<
ALayout
,
tensor_layout
::
gemm
::
RowMajor
>::
value
&&
is_same
<
BLayout
,
tensor_layout
::
gemm
::
ColumnMajor
>::
value
&&
is_same
<
CLayout
,
tensor_layout
::
gemm
::
RowMajor
>::
value
)
{
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instances
(
gemm_ptrs
);
}
}
else
if
constexpr
(
is_same
<
ADataType
,
int8_t
>::
value
&&
is_same
<
BDataType
,
int8_t
>::
value
&&
is_same
<
CDataType
,
int8_t
>::
value
)
{
if
constexpr
(
is_same
<
ALayout
,
tensor_layout
::
gemm
::
RowMajor
>::
value
&&
is_same
<
BLayout
,
tensor_layout
::
gemm
::
ColumnMajor
>::
value
&&
is_same
<
CLayout
,
tensor_layout
::
gemm
::
RowMajor
>::
value
)
{
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_c_shuffle_int8_int8_int8_mk_nk_mn_instances
(
gemm_ptrs
);
}
}
if
(
gemm_ptrs
.
size
()
<=
0
)
{
...
...
@@ -351,14 +370,79 @@ void profile_gemm_impl(int do_verification,
{
c_device_buf
.
FromDevice
(
c_m_n_device_result
.
mData
.
data
());
check_error
(
c_m_n_host_result
,
c_m_n_device_result
);
if
constexpr
(
is_same
<
ADataType
,
ck
::
bhalf_t
>::
value
&&
is_same
<
BDataType
,
ck
::
bhalf_t
>::
value
&&
is_same
<
CDataType
,
ck
::
bhalf_t
>::
value
)
{
Tensor
<
float
>
a_f32_m_k
(
f_host_tensor_descriptor
(
M
,
K
,
StrideA
,
ALayout
{}));
Tensor
<
float
>
b_f32_k_n
(
f_host_tensor_descriptor
(
K
,
N
,
StrideB
,
BLayout
{}));
Tensor
<
float
>
c_m_n_host_result
(
f_host_tensor_descriptor
(
M
,
N
,
StrideC
,
CLayout
{}));
Tensor
<
float
>
c_m_n_device_f32_result
(
f_host_tensor_descriptor
(
M
,
N
,
StrideC
,
CLayout
{}));
bf16_to_f32_
(
a_m_k
,
a_f32_m_k
);
bf16_to_f32_
(
b_k_n
,
b_f32_k_n
);
bf16_to_f32_
(
c_m_n_device_result
,
c_m_n_device_f32_result
);
using
ReferenceGemmInstance
=
ck
::
tensor_operation
::
host
::
ReferenceGemm
<
float
,
float
,
float
,
AElementOp
,
BElementOp
,
CElementOp
>
;
auto
ref_gemm
=
ReferenceGemmInstance
{};
auto
ref_invoker
=
ref_gemm
.
MakeInvoker
();
auto
ref_argument
=
ref_gemm
.
MakeArgument
(
a_f32_m_k
,
b_f32_k_n
,
c_m_n_host_result
,
a_element_op
,
b_element_op
,
c_element_op
);
ref_invoker
.
Run
(
ref_argument
);
check_error
(
c_m_n_host_result
,
c_m_n_device_f32_result
);
if
(
do_log
)
{
LogRangeAsType
<
float
>
(
std
::
cout
<<
"c_host : "
,
c_m_n_host_result
.
mData
,
","
)
<<
std
::
endl
;
}
}
else
{
Tensor
<
CDataType
>
c_m_n_host_result
(
f_host_tensor_descriptor
(
M
,
N
,
StrideC
,
CLayout
{}));
using
ReferenceGemmInstance
=
ck
::
tensor_operation
::
host
::
ReferenceGemm
<
ADataType
,
BDataType
,
CDataType
,
AElementOp
,
BElementOp
,
CElementOp
>
;
auto
ref_gemm
=
ReferenceGemmInstance
{};
auto
ref_invoker
=
ref_gemm
.
MakeInvoker
();
auto
ref_argument
=
ref_gemm
.
MakeArgument
(
a_m_k
,
b_k_n
,
c_m_n_host_result
,
a_element_op
,
b_element_op
,
c_element_op
);
ref_invoker
.
Run
(
ref_argument
);
check_error
(
c_m_n_host_result
,
c_m_n_device_result
);
if
(
do_log
)
{
LogRangeAsType
<
float
>
(
std
::
cout
<<
"c_host : "
,
c_m_n_host_result
.
mData
,
","
)
<<
std
::
endl
;
}
}
if
(
do_log
)
{
LogRangeAsType
<
float
>
(
std
::
cout
<<
"a : "
,
a_m_k
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"b: "
,
b_k_n
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"c_host : "
,
c_m_n_host_result
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"c_device: "
,
c_m_n_device_result
.
mData
,
","
)
<<
std
::
endl
;
}
...
...
profiler/include/profile_reduce_impl.hpp
0 → 100644
View file @
9dce6851
#pragma once
#include "device_reduce.hpp"
#include "device_reduce_instance.hpp"
#include "reduction_enums.hpp"
#include "host_generic_reduction.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_instance
{
template
<
int
Rank
,
typename
ReduceDims
,
int
ReduceOpId
,
int
NanOpt
,
int
IndicesOpt
>
struct
ReduceDescription
{
static
constexpr
int
Rank_
=
Rank
;
static
constexpr
int
ReduceOpId_
=
ReduceOpId
;
static
constexpr
int
NanOpt_
=
NanOpt
;
static
constexpr
int
IndicesOpt_
=
IndicesOpt
;
using
ReduceDims_
=
ReduceDims
;
};
using
reduce_description_instances
=
std
::
tuple
<
ReduceDescription
<
4
,
Sequence
<
0
,
1
,
2
>
,
0
,
0
,
0
>
,
// for ADD
ReduceDescription
<
4
,
Sequence
<
0
>
,
0
,
0
,
0
>
,
ReduceDescription
<
2
,
Sequence
<
1
>
,
0
,
0
,
0
>
,
ReduceDescription
<
4
,
Sequence
<
0
,
1
,
2
>
,
5
,
0
,
0
>
,
// for AVG
ReduceDescription
<
4
,
Sequence
<
0
>
,
5
,
0
,
0
>
,
ReduceDescription
<
2
,
Sequence
<
1
>
,
5
,
0
,
0
>
,
ReduceDescription
<
4
,
Sequence
<
0
,
1
,
2
>
,
7
,
0
,
0
>
,
// for NORM2
ReduceDescription
<
4
,
Sequence
<
0
>
,
7
,
0
,
0
>
,
ReduceDescription
<
2
,
Sequence
<
1
>
,
7
,
0
,
0
>
,
ReduceDescription
<
4
,
Sequence
<
0
,
1
,
2
>
,
2
,
0
,
0
>
,
// for MIN
ReduceDescription
<
4
,
Sequence
<
0
>
,
2
,
0
,
0
>
,
ReduceDescription
<
2
,
Sequence
<
1
>
,
2
,
0
,
0
>
,
ReduceDescription
<
4
,
Sequence
<
0
,
1
,
2
>
,
3
,
0
,
0
>
,
// for MAX
ReduceDescription
<
4
,
Sequence
<
0
>
,
3
,
0
,
0
>
,
ReduceDescription
<
2
,
Sequence
<
1
>
,
3
,
0
,
0
>
,
ReduceDescription
<
4
,
Sequence
<
0
,
1
,
2
>
,
4
,
0
,
0
>
,
// for AMAX
ReduceDescription
<
4
,
Sequence
<
0
>
,
4
,
0
,
0
>
,
ReduceDescription
<
2
,
Sequence
<
1
>
,
4
,
0
,
0
>
,
ReduceDescription
<
4
,
Sequence
<
0
,
1
,
2
>
,
2
,
0
,
1
>
,
// for MIN
ReduceDescription
<
4
,
Sequence
<
0
>
,
2
,
0
,
1
>
,
ReduceDescription
<
2
,
Sequence
<
1
>
,
2
,
0
,
1
>
,
ReduceDescription
<
4
,
Sequence
<
0
,
1
,
2
>
,
3
,
0
,
1
>
,
// for MAX
ReduceDescription
<
4
,
Sequence
<
0
>
,
3
,
0
,
1
>
,
ReduceDescription
<
2
,
Sequence
<
1
>
,
3
,
0
,
1
>
,
ReduceDescription
<
4
,
Sequence
<
0
,
1
,
2
>
,
4
,
0
,
1
>
,
// for AMAX
ReduceDescription
<
4
,
Sequence
<
0
>
,
4
,
0
,
1
>
,
ReduceDescription
<
2
,
Sequence
<
1
>
,
4
,
0
,
1
>>
;
template
<
typename
DescriptionType
>
bool
description_match
(
const
DescriptionType
&
description
,
int
Rank
,
const
std
::
vector
<
int
>&
ReduceDims
,
ReduceTensorOp_t
ReduceOpId
,
NanPropagation_t
NanOpt
,
ReduceTensorIndices_t
IndicesOpt
)
{
if
(
description
.
Rank_
!=
Rank
||
description
.
ReduceOpId_
!=
static_cast
<
int
>
(
ReduceOpId
)
||
description
.
NanOpt_
!=
static_cast
<
int
>
(
NanOpt
)
||
description
.
IndicesOpt_
!=
static_cast
<
int
>
(
IndicesOpt
))
return
(
false
);
if
(
DescriptionType
::
ReduceDims_
::
Size
()
!=
ReduceDims
.
size
())
return
(
false
);
bool
result
=
true
;
static_for
<
0
,
DescriptionType
::
ReduceDims_
::
Size
(),
1
>
{}([
&
](
auto
i
)
{
if
(
DescriptionType
::
ReduceDims_
::
At
(
i
)
!=
ReduceDims
[
i
])
result
=
false
;
});
return
(
result
);
};
}
// namespace device_reduce_instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
namespace
ck
{
namespace
profiler
{
template
<
int
Rank
,
typename
ReduceDims
>
static
std
::
vector
<
int
>
get_reduce_dims
()
{
std
::
vector
<
int
>
resDims
;
static_for
<
0
,
ReduceDims
::
Size
(),
1
>
{}([
&
](
auto
i
)
{
resDims
.
push_back
(
ReduceDims
::
At
(
i
));
});
return
(
resDims
);
};
template
<
int
Rank
,
typename
ReduceDims
>
static
std
::
vector
<
int
>
get_invariant_dims
()
{
std
::
vector
<
int
>
resDims
;
unsigned
int
incFlag
=
0
;
static_for
<
0
,
ReduceDims
::
Size
(),
1
>
{}(
[
&
](
auto
i
)
{
incFlag
=
incFlag
|
(
0x1
<<
ReduceDims
::
At
(
i
));
});
for
(
int
dim
=
0
;
dim
<
Rank
;
dim
++
)
{
if
(
incFlag
&
(
0x1
<<
dim
))
continue
;
resDims
.
push_back
(
dim
);
};
return
(
resDims
);
};
template
<
typename
T
>
static
void
dumpBufferToFile
(
const
char
*
fileName
,
T
*
data
,
size_t
dataNumItems
)
{
std
::
ofstream
outFile
(
fileName
,
std
::
ios
::
binary
);
if
(
outFile
)
{
outFile
.
write
(
reinterpret_cast
<
char
*>
(
data
),
dataNumItems
*
sizeof
(
T
));
outFile
.
close
();
std
::
cout
<<
"Write output to file "
<<
fileName
<<
std
::
endl
;
}
else
{
std
::
cout
<<
"Could not open file "
<<
fileName
<<
" for writing"
<<
std
::
endl
;
}
};
// map the data type used by the GPU kernels to the corresponding type used by the host codes
template
<
typename
inDataType
>
struct
type_mapping
{
using
outDataType
=
inDataType
;
};
template
<
>
struct
type_mapping
<
ck
::
half_t
>
{
using
outDataType
=
half_float
::
half
;
};
template
<
typename
InDataType
,
typename
AccDataType
,
typename
OutDataType
,
int
Rank
,
typename
ReduceDims_
,
ReduceTensorOp_t
ReduceOpId
,
NanPropagation_t
NanOpt
,
ReduceTensorIndices_t
IndicesOpt
>
void
profile_reduce_impl_impl
(
bool
do_verification
,
int
init_method
,
bool
do_log
,
bool
do_dumpout
,
int
nrepeat
,
const
std
::
vector
<
size_t
>&
inLengths
,
float
alpha
,
float
beta
)
{
using
namespace
ck
::
tensor_operation
::
device
;
using
namespace
ck
::
tensor_operation
::
device
::
device_reduce_instance
;
using
namespace
ck
::
host_reduce
;
constexpr
bool
op_support_indices
=
(
ReduceOpId
==
ReduceTensorOp_t
::
MIN
||
ReduceOpId
==
ReduceTensorOp_t
::
MAX
||
ReduceOpId
==
ReduceTensorOp_t
::
AMAX
);
constexpr
bool
NeedIndices
=
(
op_support_indices
&&
(
IndicesOpt
!=
ReduceTensorIndices_t
::
NO_INDICES
));
constexpr
bool
PropagateNan
=
(
NanOpt
==
NanPropagation_t
::
PROPAGATE_NAN
);
constexpr
bool
out_support_atomic_add
=
std
::
is_same
<
OutDataType
,
float
>::
value
;
constexpr
bool
op_support_atomic_add
=
!
op_support_indices
&&
ReduceOpId
!=
ReduceTensorOp_t
::
NORM2
;
constexpr
bool
use_atomic_add
=
(
out_support_atomic_add
&&
op_support_atomic_add
);
// 1) If InDataType is half_t, must use half_t as AccDataType for indexable reduction operations
// 2) If InDataType is half_t, must use float as AccDataType for non-indexable reduction
// operations
constexpr
bool
invalid_reduce_1
=
std
::
is_same
<
InDataType
,
half_t
>::
value
&&
((
!
op_support_indices
&&
!
std
::
is_same
<
AccDataType
,
float
>::
value
)
||
(
op_support_indices
&&
!
std
::
is_same
<
AccDataType
,
half_t
>::
value
));
// 1) If InDataType is float, must use float as AccDataType for indexable reduction operations
constexpr
bool
invalid_reduce_2
=
std
::
is_same
<
InDataType
,
float
>::
value
&&
(
op_support_indices
&&
!
std
::
is_same
<
AccDataType
,
float
>::
value
);
// 1) The indices can only be used when the reduction operation is indexable
constexpr
bool
invalid_reduce_3
=
(
!
op_support_indices
&&
IndicesOpt
!=
ReduceTensorIndices_t
::
NO_INDICES
);
constexpr
bool
invalid_reduce
=
(
invalid_reduce_1
||
invalid_reduce_2
||
invalid_reduce_3
);
if
constexpr
(
!
invalid_reduce
)
{
Tensor
<
InDataType
>
in
(
inLengths
);
const
std
::
vector
<
int
>
OuterDims
=
get_invariant_dims
<
Rank
,
ReduceDims_
>
();
const
std
::
vector
<
int
>
ReduceDims
=
get_reduce_dims
<
Rank
,
ReduceDims_
>
();
std
::
vector
<
size_t
>
outLengths
;
if
(
OuterDims
.
empty
())
outLengths
.
push_back
(
1
);
else
for
(
auto
dim
:
OuterDims
)
outLengths
.
push_back
(
inLengths
[
dim
]);
Tensor
<
OutDataType
>
out_ref
(
outLengths
);
Tensor
<
OutDataType
>
out
(
outLengths
);
Tensor
<
int
>
out_indices_ref
(
outLengths
);
Tensor
<
int
>
out_indices
(
outLengths
);
auto
inStrides
=
in
.
mDesc
.
GetStrides
();
auto
outStrides
=
out
.
mDesc
.
GetStrides
();
size_t
invariant_total_length
=
out
.
mDesc
.
GetElementSize
();
size_t
reduce_total_length
=
in
.
mDesc
.
GetElementSize
()
/
invariant_total_length
;
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
();
if
(
do_verification
)
{
switch
(
init_method
)
{
case
0
:
in
.
GenerateTensorValue
(
GeneratorTensor_1
<
InDataType
>
{},
num_thread
);
if
(
beta
!=
0.0
f
)
out_ref
.
GenerateTensorValue
(
GeneratorTensor_1
<
InDataType
>
{},
num_thread
);
break
;
case
1
:
in
.
GenerateTensorValue
(
GeneratorTensor_2
<
InDataType
>
{
-
5
,
5
},
num_thread
);
if
(
beta
!=
0.0
f
)
out_ref
.
GenerateTensorValue
(
GeneratorTensor_2
<
InDataType
>
{
-
5
,
5
},
num_thread
);
break
;
default:
in
.
GenerateTensorValue
(
GeneratorTensor_2
<
InDataType
>
{
1
,
5
},
num_thread
);
if
(
beta
!=
0.0
f
)
out_ref
.
GenerateTensorValue
(
GeneratorTensor_2
<
InDataType
>
{
1
,
5
},
num_thread
);
}
if
(
beta
!=
0.0
f
)
for
(
size_t
i
=
0
;
i
<
out_ref
.
mDesc
.
GetElementSpace
();
i
++
)
out
.
mData
[
i
]
=
out_ref
.
mData
[
i
];
};
// these buffers are usually provided by the user application
DeviceMem
in_dev
(
sizeof
(
InDataType
)
*
in
.
mDesc
.
GetElementSpace
());
DeviceMem
out_dev
(
sizeof
(
OutDataType
)
*
out
.
mDesc
.
GetElementSpace
());
in_dev
.
ToDevice
(
in
.
mData
.
data
());
if
(
beta
!=
0.0
f
)
out_dev
.
ToDevice
(
out
.
mData
.
data
());
size_t
indicesSizeInBytes
=
NeedIndices
?
out
.
mDesc
.
GetElementSize
()
*
sizeof
(
int
)
:
0
;
DeviceMem
out_indices_dev
(
indicesSizeInBytes
);
float
best_avg_time
=
0
;
float
best_gb_per_sec
=
0
;
using
InElementwiseOperation_0
=
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
true
>::
InElementwiseOperation
;
using
AccElementwiseOperation_0
=
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
true
>::
AccElementwiseOperation
;
using
InElementwiseOperation_1
=
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
false
>::
InElementwiseOperation
;
using
AccElementwiseOperation_1
=
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
false
>::
AccElementwiseOperation
;
using
InElementwiseOperation_2
=
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
false
,
true
>::
InElementwiseOperation
;
using
AccElementwiseOperation_2
=
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
false
,
true
>::
AccElementwiseOperation
;
using
DeviceReduceInstPtr0
=
DeviceReducePtr
<
InElementwiseOperation_0
,
AccElementwiseOperation_0
>
;
using
DeviceReduceInstPtr1
=
DeviceReducePtr
<
InElementwiseOperation_1
,
AccElementwiseOperation_1
>
;
using
DeviceReduceInstPtr2
=
DeviceReducePtr
<
InElementwiseOperation_2
,
AccElementwiseOperation_2
>
;
std
::
vector
<
DeviceReduceInstPtr0
>
reduce0_ptrs
;
std
::
vector
<
DeviceReduceInstPtr1
>
reduce1_ptrs
;
std
::
vector
<
DeviceReduceInstPtr2
>
reduce2_ptrs
;
add_device_reduce_instance_threadwise
<
InDataType
,
AccDataType
,
OutDataType
,
Rank
,
ReduceDims_
,
ReduceOpId
,
NanOpt
,
IndicesOpt
>
(
reduce0_ptrs
);
add_device_reduce_instance_blockwise
<
InDataType
,
AccDataType
,
OutDataType
,
Rank
,
ReduceDims_
,
ReduceOpId
,
NanOpt
,
IndicesOpt
>
(
reduce0_ptrs
);
if
constexpr
(
use_atomic_add
)
add_device_reduce_instance_multiblock_atomic_add
<
InDataType
,
AccDataType
,
OutDataType
,
Rank
,
ReduceDims_
,
ReduceOpId
,
NanOpt
,
IndicesOpt
>
(
reduce0_ptrs
);
else
add_device_reduce_instance_multiblock_partial_reduce
<
InDataType
,
AccDataType
,
OutDataType
,
Rank
,
ReduceDims_
,
ReduceOpId
,
NanOpt
,
IndicesOpt
>
(
reduce1_ptrs
);
// used for secondary reduction
if
constexpr
(
!
use_atomic_add
)
add_device_reduce_instance_blockwise_second_call
<
AccDataType
,
AccDataType
,
OutDataType
,
Rank
,
ReduceDims_
,
ReduceOpId
,
NanOpt
,
IndicesOpt
>
(
reduce2_ptrs
);
if
(
reduce0_ptrs
.
empty
()
&&
reduce1_ptrs
.
empty
())
{
throw
std
::
runtime_error
(
"Wrong! No device REDUCE instance found"
);
};
if
(
do_verification
)
{
using
hInType
=
typename
type_mapping
<
InDataType
>::
outDataType
;
using
hOutType
=
typename
type_mapping
<
OutDataType
>::
outDataType
;
using
hCompType
=
typename
type_mapping
<
AccDataType
>::
outDataType
;
ReductionHost
<
hInType
,
hCompType
,
hOutType
,
ReduceOpId
,
PropagateNan
,
NeedIndices
>
hostReduce
(
in
.
mDesc
,
out_ref
.
mDesc
,
OuterDims
,
ReduceDims
);
hostReduce
.
Run
(
alpha
,
reinterpret_cast
<
const
hInType
*>
(
in
.
mData
.
data
()),
beta
,
reinterpret_cast
<
hOutType
*>
(
out_ref
.
mData
.
data
()),
out_indices_ref
.
mData
.
data
());
};
const
auto
i_inLengths
=
to_int_vector
(
inLengths
);
const
auto
i_inStrides
=
to_int_vector
(
inStrides
);
const
auto
i_outLengths
=
to_int_vector
(
outLengths
);
const
auto
i_outStrides
=
to_int_vector
(
outStrides
);
for
(
auto
&
reduce_ptr
:
reduce0_ptrs
)
{
auto
wsSizeInBytes
=
reduce_ptr
->
GetWorkspaceSizeInBytes
(
i_inLengths
);
DeviceMem
ws_dev
(
wsSizeInBytes
);
auto
argument_ptr
=
reduce_ptr
->
MakeArgumentPointer
(
i_inLengths
,
i_inStrides
,
i_outLengths
,
i_outStrides
,
alpha
,
beta
,
in_dev
.
GetDeviceBuffer
(),
out_dev
.
GetDeviceBuffer
(),
out_indices_dev
.
GetDeviceBuffer
(),
ws_dev
.
GetDeviceBuffer
(),
InElementwiseOperation_0
{
static_cast
<
int32_t
>
(
reduce_total_length
)},
AccElementwiseOperation_0
{
static_cast
<
int32_t
>
(
reduce_total_length
)});
if
(
!
reduce_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
continue
;
std
::
string
reduce_name
=
reduce_ptr
->
GetTypeString
();
auto
invoker_ptr
=
reduce_ptr
->
MakeInvokerPointer
();
float
avg_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
nrepeat
);
std
::
size_t
num_bytes
=
invariant_total_length
*
reduce_total_length
*
sizeof
(
InDataType
)
+
invariant_total_length
*
sizeof
(
OutDataType
);
float
gb_per_sec
=
num_bytes
/
1.E6
/
avg_time
;
std
::
cout
<<
"Perf: "
<<
avg_time
<<
" ms, "
<<
gb_per_sec
<<
" GB/s, "
<<
reduce_name
<<
std
::
endl
;
if
(
gb_per_sec
>
best_gb_per_sec
)
{
best_avg_time
=
avg_time
;
best_gb_per_sec
=
gb_per_sec
;
}
if
(
do_verification
)
{
out_dev
.
FromDevice
(
out
.
mData
.
data
());
check_error
(
out_ref
,
out
);
if
(
NeedIndices
)
{
out_indices_dev
.
FromDevice
(
out_indices
.
mData
.
data
());
check_indices
(
out_indices_ref
,
out_indices
);
};
if
(
do_log
)
{
LogRangeAsType
<
float
>
(
std
::
cout
<<
"out_host : "
,
out_ref
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"out_device: "
,
out
.
mData
,
","
)
<<
std
::
endl
;
};
};
if
(
do_dumpout
)
{
dumpBufferToFile
(
"dump_in.bin"
,
in
.
mData
.
data
(),
in
.
mDesc
.
GetElementSize
());
dumpBufferToFile
(
"dump_out.bin"
,
out
.
mData
.
data
(),
out
.
mDesc
.
GetElementSize
());
dumpBufferToFile
(
"dump_out_host.bin"
,
out_ref
.
mData
.
data
(),
out_ref
.
mDesc
.
GetElementSize
());
if
(
NeedIndices
)
{
dumpBufferToFile
(
"dump_indices.bin"
,
out_indices
.
mData
.
data
(),
out_indices
.
mDesc
.
GetElementSize
());
dumpBufferToFile
(
"dump_indices_host.bin"
,
out_indices_ref
.
mData
.
data
(),
out_indices_ref
.
mDesc
.
GetElementSize
());
};
};
};
for
(
auto
&
reduce_ptr
:
reduce1_ptrs
)
{
auto
wsSizeInBytes
=
reduce_ptr
->
GetWorkspaceSizeInBytes
(
i_inLengths
);
DeviceMem
ws_dev
(
wsSizeInBytes
);
auto
argument_ptr
=
reduce_ptr
->
MakeArgumentPointer
(
i_inLengths
,
i_inStrides
,
i_outLengths
,
i_outStrides
,
alpha
,
beta
,
in_dev
.
GetDeviceBuffer
(),
out_dev
.
GetDeviceBuffer
(),
out_indices_dev
.
GetDeviceBuffer
(),
ws_dev
.
GetDeviceBuffer
(),
InElementwiseOperation_1
{
static_cast
<
int32_t
>
(
reduce_total_length
)},
AccElementwiseOperation_1
{
static_cast
<
int32_t
>
(
reduce_total_length
)});
if
(
!
reduce_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
continue
;
std
::
string
reduce_name
=
reduce_ptr
->
GetTypeString
();
auto
invoker_ptr
=
reduce_ptr
->
MakeInvokerPointer
();
float
avg_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
nrepeat
);
std
::
size_t
num_bytes
=
invariant_total_length
*
reduce_total_length
*
sizeof
(
InDataType
)
+
invariant_total_length
*
sizeof
(
OutDataType
);
std
::
vector
<
int
>
inLengths2
=
reduce_ptr
->
GetWorkspace2dLengths
(
argument_ptr
.
get
());
std
::
vector
<
int
>
inStrides2
{
inLengths2
[
1
],
1
};
for
(
auto
&
reduce2_ptr
:
reduce2_ptrs
)
{
auto
argument2_ptr
=
reduce2_ptr
->
MakeArgumentPointer
(
inLengths2
,
inStrides2
,
i_outLengths
,
i_outStrides
,
alpha
,
beta
,
ws_dev
.
GetDeviceBuffer
(),
out_dev
.
GetDeviceBuffer
(),
out_indices_dev
.
GetDeviceBuffer
(),
ws_dev
.
GetDeviceBuffer
(),
InElementwiseOperation_2
{
static_cast
<
int32_t
>
(
reduce_total_length
)},
AccElementwiseOperation_2
{
static_cast
<
int32_t
>
(
reduce_total_length
)});
if
(
!
reduce2_ptr
->
IsSupportedArgument
(
argument2_ptr
.
get
()))
continue
;
std
::
string
reduce2_name
=
reduce2_ptr
->
GetTypeString
();
auto
invoker2_ptr
=
reduce2_ptr
->
MakeInvokerPointer
();
float
avg_time_2
=
invoker2_ptr
->
Run
(
argument2_ptr
.
get
(),
nrepeat
);
std
::
size_t
num_bytes_2
=
static_cast
<
size_t
>
(
inLengths2
[
0
])
*
inLengths2
[
1
]
*
sizeof
(
AccDataType
);
float
gb_per_sec
=
(
num_bytes
+
num_bytes_2
)
/
1.E6
/
(
avg_time
+
avg_time_2
);
std
::
cout
<<
"Perf: "
<<
(
avg_time
+
avg_time_2
)
<<
" ms, "
<<
gb_per_sec
<<
" GB/s, "
<<
reduce_name
<<
" => "
<<
reduce2_name
<<
std
::
endl
;
if
(
gb_per_sec
>
best_gb_per_sec
)
{
best_avg_time
=
avg_time
+
avg_time_2
;
best_gb_per_sec
=
gb_per_sec
;
}
if
(
do_verification
)
{
out_dev
.
FromDevice
(
out
.
mData
.
data
());
check_error
(
out_ref
,
out
);
if
(
NeedIndices
)
{
out_indices_dev
.
FromDevice
(
out_indices
.
mData
.
data
());
check_indices
(
out_indices_ref
,
out_indices
);
};
if
(
do_log
)
{
LogRangeAsType
<
float
>
(
std
::
cout
<<
"out_host : "
,
out_ref
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"out_device: "
,
out
.
mData
,
","
)
<<
std
::
endl
;
}
}
if
(
do_dumpout
)
{
dumpBufferToFile
(
"dump_in.bin"
,
in
.
mData
.
data
(),
in
.
mDesc
.
GetElementSize
());
dumpBufferToFile
(
"dump_out.bin"
,
out
.
mData
.
data
(),
out
.
mDesc
.
GetElementSize
());
dumpBufferToFile
(
"dump_out_host.bin"
,
out_ref
.
mData
.
data
(),
out_ref
.
mDesc
.
GetElementSize
());
if
(
NeedIndices
)
{
dumpBufferToFile
(
"dump_indices.bin"
,
out_indices
.
mData
.
data
(),
out_indices
.
mDesc
.
GetElementSize
());
dumpBufferToFile
(
"dump_indices_host.bin"
,
out_indices_ref
.
mData
.
data
(),
out_indices_ref
.
mDesc
.
GetElementSize
());
};
};
};
};
std
::
cout
<<
"Best Perf: "
<<
best_avg_time
<<
" ms, "
<<
best_gb_per_sec
<<
" GB/s"
<<
std
::
endl
;
}
else
{
std
::
cout
<<
"The requested reduction operation is not supported, please check !!!"
<<
std
::
endl
;
};
};
/// @brief Profiling entry point for tensor reduction: dispatches from the
///        runtime reduction configuration (rank, reduced dims, reduce op,
///        NaN/indices options) to the matching compile-time instantiation of
///        profile_reduce_impl_impl.
///
/// The set of supported configurations is the compile-time tuple
/// reduce_description_instances; this function iterates it with static_for,
/// uses description_match() to compare each description against the runtime
/// arguments, and invokes profile_reduce_impl_impl exactly once for the first
/// description that matches (the `matched` flag short-circuits the remaining
/// iterations).
///
/// @tparam InDataType   device input element type
/// @tparam AccDataType  accumulation element type
/// @tparam OutDataType  device output element type
///
/// @param do_verification  verify device results against a host reduction
/// @param init_method      input initialization selector (forwarded to impl)
/// @param do_log           print host/device output values
/// @param do_dumpout       dump input/output buffers to files
/// @param nrepeat          number of kernel timing repetitions
/// @param inLengths        input tensor lengths
/// @param ReduceDims       dimensions to reduce over
/// @param ReduceOpId       reduction operation id (e.g. add/min/max — see enum)
/// @param NanOpt           NaN propagation option
/// @param IndicesOpt       whether reduction indices are requested
/// @param alpha            output scaling factor applied to the reduced value
/// @param beta             blending factor for the prior output content
///
/// NOTE(review): if no description in reduce_description_instances matches the
/// runtime arguments, this function silently does nothing — presumably the
/// "not supported" diagnostic is handled inside profile_reduce_impl_impl or by
/// the caller; confirm against the call sites.
template <typename InDataType, typename AccDataType, typename OutDataType>
void profile_reduce_impl(bool do_verification,
                         int init_method,
                         bool do_log,
                         bool do_dumpout,
                         int nrepeat,
                         const std::vector<size_t>& inLengths,
                         const std::vector<int>& ReduceDims,
                         ReduceTensorOp_t ReduceOpId,
                         NanPropagation_t NanOpt,
                         ReduceTensorIndices_t IndicesOpt,
                         float alpha,
                         float beta)
{
    // Set to true once a matching description has been dispatched; later
    // static_for iterations return immediately.
    bool matched = false;

    // Compile-time list of all reduction configurations the profiler supports.
    using tuple_of_description_instances =
        tensor_operation::device::device_reduce_instance::reduce_description_instances;

    // Value instance only needed so std::get<i> below can yield each
    // description's type via decltype.
    const auto tuple_object = tuple_of_description_instances{};

    // Compile-time loop over every description instance in the tuple.
    static_for<0, std::tuple_size<tuple_of_description_instances>::value, 1>{}([&](auto i) {
        if(matched)
            return;

        // The i-th description type, stripped of cv/ref qualifiers.
        using descType = remove_cvref_t<decltype(std::get<i>(tuple_object))>;

        // Skip descriptions whose static configuration does not match the
        // runtime rank / dims / op / options.
        if(!description_match(
               descType{}, inLengths.size(), ReduceDims, ReduceOpId, NanOpt, IndicesOpt))
            return;

        // Re-materialize the description's static members as the non-type
        // template arguments of the concrete profiling implementation.
        profile_reduce_impl_impl<InDataType,
                                 AccDataType,
                                 OutDataType,
                                 descType::Rank_,
                                 typename descType::ReduceDims_,
                                 static_cast<ReduceTensorOp_t>(descType::ReduceOpId_),
                                 static_cast<NanPropagation_t>(descType::NanOpt_),
                                 static_cast<ReduceTensorIndices_t>(descType::IndicesOpt_)>(
            do_verification, init_method, do_log, do_dumpout, nrepeat, inLengths, alpha, beta);

        matched = true;
    });
};
}
// namespace profiler
}
// namespace ck
profiler/README.md
→
profiler/
src/
README.md
View file @
9dce6851
File moved
Prev
1
…
18
19
20
21
22
23
24
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment