Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
b7a6f810
Commit
b7a6f810
authored
Mar 10, 2022
by
Chao Liu
Browse files
Merge remote-tracking branch 'origin/develop' into fix_threadwise_copy_error_in_reduction
parents
b29dfd70
827301d9
Changes
70
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
367 additions
and
365 deletions
+367
-365
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.hpp
...u/reduce/device_reduce_instance_blockwise_f64_f64_f64.hpp
+27
-27
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call.hpp
...u/reduce/device_reduce_instance_blockwise_second_call.hpp
+33
-33
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f16_f16_f16.hpp
...ice_reduce_instance_blockwise_second_call_f16_f16_f16.hpp
+19
-19
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f16.hpp
...ice_reduce_instance_blockwise_second_call_f32_f32_f16.hpp
+9
-9
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f32.hpp
...ice_reduce_instance_blockwise_second_call_f32_f32_f32.hpp
+27
-27
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f32.hpp
...ice_reduce_instance_blockwise_second_call_f64_f64_f32.hpp
+9
-9
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f64.hpp
...ice_reduce_instance_blockwise_second_call_f64_f64_f64.hpp
+27
-27
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp
...u/reduce/device_reduce_instance_multiblock_atomic_add.hpp
+19
-19
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp
...ice_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp
+6
-6
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp
...ice_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp
+6
-6
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp
...ice_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp
+6
-6
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce.hpp
...duce/device_reduce_instance_multiblock_partial_reduce.hpp
+34
-35
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.hpp
...reduce_instance_multiblock_partial_reduce_f16_f16_f16.hpp
+19
-19
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.hpp
...reduce_instance_multiblock_partial_reduce_f16_f32_f16.hpp
+9
-9
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.hpp
...reduce_instance_multiblock_partial_reduce_f32_f32_f32.hpp
+22
-22
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.hpp
...reduce_instance_multiblock_partial_reduce_f32_f64_f32.hpp
+4
-4
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.hpp
...reduce_instance_multiblock_partial_reduce_f64_f64_f64.hpp
+29
-29
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp
...instance/gpu/reduce/device_reduce_instance_threadwise.hpp
+34
-31
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp
.../reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp
+19
-19
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp
.../reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp
+9
-9
No files found.
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.hpp
View file @
b7a6f810
...
@@ -11,34 +11,34 @@ namespace device {
...
@@ -11,34 +11,34 @@ namespace device {
namespace
device_reduce_instance
{
namespace
device_reduce_instance
{
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDim
s
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank |
Num
ReduceDim
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
4
,
0
,
1
,
2
);
// for ADD
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
4
,
3
);
// for ADD
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
4
,
0
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
4
,
0
,
1
,
2
);
// for AVG
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
4
,
3
);
// for AVG
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
4
,
0
);
//
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
2
,
1
);
//
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
4
,
0
,
1
,
2
);
// for NORM2
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
4
,
3
);
// for NORM2
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
4
,
0
);
//
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
2
,
1
);
//
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
4
,
0
,
1
,
2
);
// for MIN
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
4
,
3
);
// for MIN
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
4
,
0
);
//
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
2
,
1
);
//
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
4
,
0
,
1
,
2
);
// for MAX
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
4
,
3
);
// for MAX
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
4
,
0
);
//
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
2
,
1
);
//
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
4
,
0
,
1
,
2
);
// for AMAX
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
4
,
3
);
// for AMAX
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
4
,
0
);
//
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
2
,
1
);
//
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
4
,
0
,
1
,
2
);
// for MIN
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
4
,
3
);
// for MIN
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
4
,
0
);
//
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
4
,
1
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
2
,
1
);
//
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
2
,
1
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
4
,
0
,
1
,
2
);
// for MAX
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
4
,
3
);
// for MAX
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
4
,
0
);
//
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
4
,
1
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
2
,
1
);
//
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
2
,
1
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
4
,
0
,
1
,
2
);
// for AMAX
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
4
,
3
);
// for AMAX
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
4
,
0
);
//
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
4
,
1
);
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
2
,
1
);
//
ADD_BLOCKWISE_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
2
,
1
);
// clang-format on
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device_reduce_instance
...
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call.hpp
View file @
b7a6f810
...
@@ -45,7 +45,7 @@ template <typename InDataType,
...
@@ -45,7 +45,7 @@ template <typename InDataType,
typename
AccDataType
,
typename
AccDataType
,
typename
OutDataType
,
typename
OutDataType
,
int
Rank
,
int
Rank
,
typename
ReduceDim
s
,
int
Num
ReduceDim
,
ReduceTensorOp_t
ReduceOpId
,
ReduceTensorOp_t
ReduceOpId
,
NanPropagation_t
NanOpt
,
NanPropagation_t
NanOpt
,
ReduceTensorIndices_t
IndicesOpt
>
ReduceTensorIndices_t
IndicesOpt
>
...
@@ -86,7 +86,7 @@ void add_device_reduce_instance_blockwise_second_call(
...
@@ -86,7 +86,7 @@ void add_device_reduce_instance_blockwise_second_call(
AccDataType
,
AccDataType
,
OutDataType
,
OutDataType
,
Rank
,
Rank
,
ReduceDim
s
,
Num
ReduceDim
,
ReduceOperation
,
ReduceOperation
,
InElementwiseOperation
,
InElementwiseOperation
,
AccElementwiseOperation
,
AccElementwiseOperation
,
...
@@ -107,12 +107,12 @@ void add_device_reduce_instance_blockwise_second_call(
...
@@ -107,12 +107,12 @@ void add_device_reduce_instance_blockwise_second_call(
};
};
#define ADD_BLOCKWISE_SECOND_CALL_INST_BY_TYPE( \
#define ADD_BLOCKWISE_SECOND_CALL_INST_BY_TYPE( \
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank,
...)
\
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank,
NumReduceDim)
\
template void add_device_reduce_instance_blockwise_second_call<inT, \
template void add_device_reduce_instance_blockwise_second_call<inT, \
compT, \
compT, \
outT, \
outT, \
Rank, \
Rank, \
Sequence<__VA_ARGS__>
, \
NumReduceDim
, \
ReduceOpId, \
ReduceOpId, \
NanOpt, \
NanOpt, \
IndicesOpt>( \
IndicesOpt>( \
...
@@ -120,7 +120,7 @@ void add_device_reduce_instance_blockwise_second_call(
...
@@ -120,7 +120,7 @@ void add_device_reduce_instance_blockwise_second_call(
device_op_instances)
device_op_instances)
#define ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID( \
#define ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID( \
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank,
...)
\
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank,
NumReduceDim)
\
ADD_BLOCKWISE_SECOND_CALL_INST_BY_TYPE(inT, \
ADD_BLOCKWISE_SECOND_CALL_INST_BY_TYPE(inT, \
compT, \
compT, \
outT, \
outT, \
...
@@ -128,15 +128,15 @@ void add_device_reduce_instance_blockwise_second_call(
...
@@ -128,15 +128,15 @@ void add_device_reduce_instance_blockwise_second_call(
static_cast<NanPropagation_t>(NanOpt), \
static_cast<NanPropagation_t>(NanOpt), \
static_cast<ReduceTensorIndices_t>(IndicesOpt), \
static_cast<ReduceTensorIndices_t>(IndicesOpt), \
Rank, \
Rank, \
__VA_ARGS__
)
NumReduceDim
)
#define ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_TYPE( \
#define ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_TYPE( \
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank,
...)
\
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank,
NumReduceDim)
\
extern template void add_device_reduce_instance_blockwise_second_call<inT, \
extern template void add_device_reduce_instance_blockwise_second_call<inT, \
compT, \
compT, \
outT, \
outT, \
Rank, \
Rank, \
Sequence<__VA_ARGS__>,
\
NumReduceDim,
\
ReduceOpId, \
ReduceOpId, \
NanOpt, \
NanOpt, \
IndicesOpt>( \
IndicesOpt>( \
...
@@ -148,7 +148,7 @@ void add_device_reduce_instance_blockwise_second_call(
...
@@ -148,7 +148,7 @@ void add_device_reduce_instance_blockwise_second_call(
device_op_instances)
device_op_instances)
#define ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID( \
#define ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID( \
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank,
...)
\
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank,
NumReduceDim)
\
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_TYPE(inT, \
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_TYPE(inT, \
compT, \
compT, \
outT, \
outT, \
...
@@ -156,7 +156,7 @@ void add_device_reduce_instance_blockwise_second_call(
...
@@ -156,7 +156,7 @@ void add_device_reduce_instance_blockwise_second_call(
static_cast<NanPropagation_t>(NanOpt), \
static_cast<NanPropagation_t>(NanOpt), \
static_cast<ReduceTensorIndices_t>(IndicesOpt), \
static_cast<ReduceTensorIndices_t>(IndicesOpt), \
Rank, \
Rank, \
__VA_ARGS__
)
NumReduceDim
)
}
// namespace device_reduce_instance
}
// namespace device_reduce_instance
}
// namespace device
}
// namespace device
...
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f16_f16_f16.hpp
View file @
b7a6f810
...
@@ -11,25 +11,25 @@ namespace device {
...
@@ -11,25 +11,25 @@ namespace device {
namespace
device_reduce_instance
{
namespace
device_reduce_instance
{
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDim
s
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank |
Num
ReduceDim
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
0
,
4
,
0
,
1
,
2
);
// for MIN
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
0
,
4
,
3
);
// for MIN
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
0
,
4
,
0
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
0
,
2
,
1
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
0
,
4
,
0
,
1
,
2
);
// for MAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
0
,
4
,
3
);
// for MAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
0
,
4
,
0
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
0
,
2
,
1
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
0
,
4
,
0
,
1
,
2
);
// for AMAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
0
,
4
,
3
);
// for AMAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
0
,
4
,
0
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
0
,
2
,
1
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
1
,
4
,
0
,
1
,
2
);
// for MIN
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
1
,
4
,
3
);
// for MIN
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
1
,
4
,
0
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
1
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
1
,
2
,
1
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
1
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
1
,
4
,
0
,
1
,
2
);
// for MAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
1
,
4
,
3
);
// for MAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
1
,
4
,
0
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
1
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
1
,
2
,
1
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
1
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
1
,
4
,
0
,
1
,
2
);
// for AMAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
1
,
4
,
3
);
// for AMAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
1
,
4
,
0
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
1
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
1
,
2
,
1
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
1
,
2
,
1
);
// clang-format on
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device_reduce_instance
...
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f16.hpp
View file @
b7a6f810
...
@@ -11,16 +11,16 @@ namespace device {
...
@@ -11,16 +11,16 @@ namespace device {
namespace
device_reduce_instance
{
namespace
device_reduce_instance
{
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDim
s
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank |
Num
ReduceDim
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
half_t
,
0
,
0
,
0
,
4
,
0
,
1
,
2
);
// for ADD
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
half_t
,
0
,
0
,
0
,
4
,
3
);
// for ADD
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
half_t
,
0
,
0
,
0
,
4
,
0
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
half_t
,
0
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
half_t
,
0
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
half_t
,
0
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
half_t
,
5
,
0
,
0
,
4
,
0
,
1
,
2
);
// for AVG
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
half_t
,
5
,
0
,
0
,
4
,
3
);
// for AVG
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
half_t
,
5
,
0
,
0
,
4
,
0
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
half_t
,
5
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
half_t
,
5
,
0
,
0
,
2
,
1
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
half_t
,
5
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
half_t
,
7
,
0
,
0
,
4
,
0
,
1
,
2
);
// for NORM2
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
half_t
,
7
,
0
,
0
,
4
,
3
);
// for NORM2
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
half_t
,
7
,
0
,
0
,
4
,
0
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
half_t
,
7
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
half_t
,
7
,
0
,
0
,
2
,
1
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
half_t
,
7
,
0
,
0
,
2
,
1
);
// clang-format on
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device_reduce_instance
...
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f32_f32_f32.hpp
View file @
b7a6f810
...
@@ -11,34 +11,34 @@ namespace device {
...
@@ -11,34 +11,34 @@ namespace device {
namespace
device_reduce_instance
{
namespace
device_reduce_instance
{
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDim
s
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank |
Num
ReduceDim
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
0
,
0
,
0
,
4
,
0
,
1
,
2
);
// for ADD
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
0
,
0
,
0
,
4
,
3
);
// for ADD
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
0
,
0
,
0
,
4
,
0
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
0
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
0
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
0
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
5
,
0
,
0
,
4
,
0
,
1
,
2
);
// for AVG
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
5
,
0
,
0
,
4
,
3
);
// for AVG
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
5
,
0
,
0
,
4
,
0
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
5
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
5
,
0
,
0
,
2
,
1
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
5
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
7
,
0
,
0
,
4
,
0
,
1
,
2
);
// for NORM2
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
7
,
0
,
0
,
4
,
3
);
// for NORM2
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
7
,
0
,
0
,
4
,
0
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
7
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
7
,
0
,
0
,
2
,
1
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
7
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
0
,
4
,
0
,
1
,
2
);
// for MIN
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
0
,
4
,
3
);
// for MIN
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
0
,
4
,
0
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
0
,
2
,
1
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
0
,
4
,
0
,
1
,
2
);
// for MAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
0
,
4
,
3
);
// for MAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
0
,
4
,
0
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
0
,
2
,
1
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
0
,
4
,
0
,
1
,
2
);
// for AMAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
0
,
4
,
3
);
// for AMAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
0
,
4
,
0
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
0
,
2
,
1
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
1
,
4
,
0
,
1
,
2
);
// for MIN
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
1
,
4
,
3
);
// for MIN
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
1
,
4
,
0
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
1
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
1
,
2
,
1
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
1
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
1
,
4
,
0
,
1
,
2
);
// for MAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
1
,
4
,
3
);
// for MAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
1
,
4
,
0
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
1
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
1
,
2
,
1
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
1
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
1
,
4
,
0
,
1
,
2
);
// for AMAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
1
,
4
,
3
);
// for AMAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
1
,
4
,
0
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
1
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
1
,
2
,
1
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
1
,
2
,
1
);
// clang-format on
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device_reduce_instance
...
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f32.hpp
View file @
b7a6f810
...
@@ -11,16 +11,16 @@ namespace device {
...
@@ -11,16 +11,16 @@ namespace device {
namespace
device_reduce_instance
{
namespace
device_reduce_instance
{
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDim
s
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank |
Num
ReduceDim
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
float
,
0
,
0
,
0
,
4
,
0
,
1
,
2
);
// for ADD
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
float
,
0
,
0
,
0
,
4
,
3
);
// for ADD
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
float
,
0
,
0
,
0
,
4
,
0
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
float
,
0
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
float
,
0
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
float
,
0
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
float
,
5
,
0
,
0
,
4
,
0
,
1
,
2
);
// for AVG
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
float
,
5
,
0
,
0
,
4
,
3
);
// for AVG
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
float
,
5
,
0
,
0
,
4
,
0
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
float
,
5
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
float
,
5
,
0
,
0
,
2
,
1
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
float
,
5
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
float
,
7
,
0
,
0
,
4
,
0
,
1
,
2
);
// for NORM2
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
float
,
7
,
0
,
0
,
4
,
3
);
// for NORM2
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
float
,
7
,
0
,
0
,
4
,
0
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
float
,
7
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
float
,
7
,
0
,
0
,
2
,
1
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
float
,
7
,
0
,
0
,
2
,
1
);
// clang-format on
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device_reduce_instance
...
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_second_call_f64_f64_f64.hpp
View file @
b7a6f810
...
@@ -11,34 +11,34 @@ namespace device {
...
@@ -11,34 +11,34 @@ namespace device {
namespace
device_reduce_instance
{
namespace
device_reduce_instance
{
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDim
s
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank |
Num
ReduceDim
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
4
,
0
,
1
,
2
);
// for ADD
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
4
,
3
);
// for ADD
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
4
,
0
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
4
,
0
,
1
,
2
);
// for AVG
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
4
,
3
);
// for AVG
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
4
,
0
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
2
,
1
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
4
,
0
,
1
,
2
);
// for NORM2
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
4
,
3
);
// for NORM2
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
4
,
0
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
2
,
1
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
4
,
0
,
1
,
2
);
// for MIN
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
4
,
3
);
// for MIN
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
4
,
0
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
2
,
1
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
4
,
0
,
1
,
2
);
// for MAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
4
,
3
);
// for MAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
4
,
0
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
2
,
1
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
4
,
0
,
1
,
2
);
// for AMAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
4
,
3
);
// for AMAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
4
,
0
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
2
,
1
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
4
,
0
,
1
,
2
);
// for MIN
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
4
,
3
);
// for MIN
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
4
,
0
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
2
,
1
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
4
,
0
,
1
,
2
);
// for MAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
4
,
3
);
// for MAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
4
,
0
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
2
,
1
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
2
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
4
,
0
,
1
,
2
);
// for AMAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
4
,
3
);
// for AMAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
4
,
0
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
4
,
1
);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
2
,
1
);
//
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
2
,
1
);
// clang-format on
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device_reduce_instance
...
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp
View file @
b7a6f810
...
@@ -59,7 +59,7 @@ template <typename InDataType,
...
@@ -59,7 +59,7 @@ template <typename InDataType,
typename
AccDataType
,
typename
AccDataType
,
typename
OutDataType
,
typename
OutDataType
,
int
Rank
,
int
Rank
,
typename
ReduceDim
s
,
int
Num
ReduceDim
,
ReduceTensorOp_t
ReduceOpId
,
ReduceTensorOp_t
ReduceOpId
,
NanPropagation_t
NanOpt
,
NanPropagation_t
NanOpt
,
ReduceTensorIndices_t
IndicesOpt
>
ReduceTensorIndices_t
IndicesOpt
>
...
@@ -110,7 +110,7 @@ void add_device_reduce_instance_multiblock_atomic_add(
...
@@ -110,7 +110,7 @@ void add_device_reduce_instance_multiblock_atomic_add(
AccDataType
,
AccDataType
,
OutDataType
,
OutDataType
,
Rank
,
Rank
,
ReduceDim
s
,
Num
ReduceDim
,
ReduceOperation
,
ReduceOperation
,
InElementwiseOperation
,
InElementwiseOperation
,
AccElementwiseOperation
,
AccElementwiseOperation
,
...
@@ -133,12 +133,12 @@ void add_device_reduce_instance_multiblock_atomic_add(
...
@@ -133,12 +133,12 @@ void add_device_reduce_instance_multiblock_atomic_add(
};
};
#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_TYPE( \
#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_TYPE( \
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank,
...)
\
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank,
NumReduceDim)
\
template void add_device_reduce_instance_multiblock_atomic_add<inT, \
template void add_device_reduce_instance_multiblock_atomic_add<inT, \
compT, \
compT, \
outT, \
outT, \
Rank, \
Rank, \
Sequence<__VA_ARGS__>
, \
NumReduceDim
, \
ReduceOpId, \
ReduceOpId, \
NanOpt, \
NanOpt, \
IndicesOpt>( \
IndicesOpt>( \
...
@@ -146,7 +146,7 @@ void add_device_reduce_instance_multiblock_atomic_add(
...
@@ -146,7 +146,7 @@ void add_device_reduce_instance_multiblock_atomic_add(
device_op_instances)
device_op_instances)
#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID( \
#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID( \
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank,
...)
\
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank,
NumReduceDim)
\
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_TYPE(inT, \
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_TYPE(inT, \
compT, \
compT, \
outT, \
outT, \
...
@@ -154,15 +154,15 @@ void add_device_reduce_instance_multiblock_atomic_add(
...
@@ -154,15 +154,15 @@ void add_device_reduce_instance_multiblock_atomic_add(
static_cast<NanPropagation_t>(NanOpt), \
static_cast<NanPropagation_t>(NanOpt), \
static_cast<ReduceTensorIndices_t>(IndicesOpt), \
static_cast<ReduceTensorIndices_t>(IndicesOpt), \
Rank, \
Rank, \
__VA_ARGS__
)
NumReduceDim
)
#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE( \
#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE( \
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank,
...)
\
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank,
NumReduceDim)
\
extern template void add_device_reduce_instance_multiblock_atomic_add<inT, \
extern template void add_device_reduce_instance_multiblock_atomic_add<inT, \
compT, \
compT, \
outT, \
outT, \
Rank, \
Rank, \
Sequence<__VA_ARGS__>,
\
NumReduceDim,
\
ReduceOpId, \
ReduceOpId, \
NanOpt, \
NanOpt, \
IndicesOpt>( \
IndicesOpt>( \
...
@@ -173,7 +173,7 @@ void add_device_reduce_instance_multiblock_atomic_add(
...
@@ -173,7 +173,7 @@ void add_device_reduce_instance_multiblock_atomic_add(
device_op_instances)
device_op_instances)
#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID( \
#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID( \
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank,
...)
\
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank,
NumReduceDim)
\
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE(inT, \
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE(inT, \
compT, \
compT, \
outT, \
outT, \
...
@@ -181,7 +181,7 @@ void add_device_reduce_instance_multiblock_atomic_add(
...
@@ -181,7 +181,7 @@ void add_device_reduce_instance_multiblock_atomic_add(
static_cast<NanPropagation_t>(NanOpt), \
static_cast<NanPropagation_t>(NanOpt), \
static_cast<ReduceTensorIndices_t>(IndicesOpt), \
static_cast<ReduceTensorIndices_t>(IndicesOpt), \
Rank, \
Rank, \
__VA_ARGS__
)
NumReduceDim
)
}
// namespace device_reduce_instance
}
// namespace device_reduce_instance
}
// namespace device
}
// namespace device
...
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp
View file @
b7a6f810
...
@@ -11,13 +11,13 @@ namespace device {
...
@@ -11,13 +11,13 @@ namespace device {
namespace
device_reduce_instance
{
namespace
device_reduce_instance
{
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDim
s
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank |
Num
ReduceDim
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
half_t
,
float
,
float
,
0
,
0
,
0
,
4
,
0
,
1
,
2
);
// for ADD
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
half_t
,
float
,
float
,
0
,
0
,
0
,
4
,
3
);
// for ADD
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
half_t
,
float
,
float
,
0
,
0
,
0
,
4
,
0
);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
half_t
,
float
,
float
,
0
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
half_t
,
float
,
float
,
0
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
half_t
,
float
,
float
,
0
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
half_t
,
float
,
float
,
5
,
0
,
0
,
4
,
0
,
1
,
2
);
// for AVG
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
half_t
,
float
,
float
,
5
,
0
,
0
,
4
,
3
);
// for AVG
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
half_t
,
float
,
float
,
5
,
0
,
0
,
4
,
0
);
//
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
half_t
,
float
,
float
,
5
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
half_t
,
float
,
float
,
5
,
0
,
0
,
2
,
1
);
//
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
half_t
,
float
,
float
,
5
,
0
,
0
,
2
,
1
);
// clang-format on
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device_reduce_instance
...
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp
View file @
b7a6f810
...
@@ -11,13 +11,13 @@ namespace device {
...
@@ -11,13 +11,13 @@ namespace device {
namespace
device_reduce_instance
{
namespace
device_reduce_instance
{
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDim
s
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank |
Num
ReduceDim
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
float
,
float
,
float
,
0
,
0
,
0
,
4
,
0
,
1
,
2
);
// for ADD
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
float
,
float
,
float
,
0
,
0
,
0
,
4
,
3
);
// for ADD
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
float
,
float
,
float
,
0
,
0
,
0
,
4
,
0
);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
float
,
float
,
float
,
0
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
float
,
float
,
float
,
0
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
float
,
float
,
float
,
0
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
float
,
float
,
float
,
5
,
0
,
0
,
4
,
0
,
1
,
2
);
// for AVG
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
float
,
float
,
float
,
5
,
0
,
0
,
4
,
3
);
// for AVG
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
float
,
float
,
float
,
5
,
0
,
0
,
4
,
0
);
//
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
float
,
float
,
float
,
5
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
float
,
float
,
float
,
5
,
0
,
0
,
2
,
1
);
//
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
float
,
float
,
float
,
5
,
0
,
0
,
2
,
1
);
// clang-format on
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device_reduce_instance
...
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp
View file @
b7a6f810
...
@@ -11,13 +11,13 @@ namespace device {
...
@@ -11,13 +11,13 @@ namespace device {
namespace
device_reduce_instance
{
namespace
device_reduce_instance
{
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDim
s
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank |
Num
ReduceDim
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
float
,
double
,
float
,
0
,
0
,
0
,
4
,
0
,
1
,
2
);
// for ADD
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
float
,
double
,
float
,
0
,
0
,
0
,
4
,
3
);
// for ADD
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
float
,
double
,
float
,
0
,
0
,
0
,
4
,
0
);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
float
,
double
,
float
,
0
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
float
,
double
,
float
,
0
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
float
,
double
,
float
,
0
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
float
,
double
,
float
,
5
,
0
,
0
,
4
,
0
,
1
,
2
);
// for AVG
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
float
,
double
,
float
,
5
,
0
,
0
,
4
,
3
);
// for AVG
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
float
,
double
,
float
,
5
,
0
,
0
,
4
,
0
);
//
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
float
,
double
,
float
,
5
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
float
,
double
,
float
,
5
,
0
,
0
,
2
,
1
);
//
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID
(
float
,
double
,
float
,
5
,
0
,
0
,
2
,
1
);
// clang-format on
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device_reduce_instance
...
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce.hpp
View file @
b7a6f810
...
@@ -55,7 +55,7 @@ template <typename InDataType,
...
@@ -55,7 +55,7 @@ template <typename InDataType,
typename
AccDataType
,
typename
AccDataType
,
typename
OutDataType
,
typename
OutDataType
,
int
Rank
,
int
Rank
,
typename
ReduceDim
s
,
int
Num
ReduceDim
,
ReduceTensorOp_t
ReduceOpId
,
ReduceTensorOp_t
ReduceOpId
,
NanPropagation_t
NanOpt
,
NanPropagation_t
NanOpt
,
ReduceTensorIndices_t
IndicesOpt
>
ReduceTensorIndices_t
IndicesOpt
>
...
@@ -93,7 +93,7 @@ void add_device_reduce_instance_multiblock_partial_reduce(
...
@@ -93,7 +93,7 @@ void add_device_reduce_instance_multiblock_partial_reduce(
AccDataType
,
AccDataType
,
OutDataType
,
OutDataType
,
Rank
,
Rank
,
ReduceDim
s
,
Num
ReduceDim
,
ReduceOperation
,
ReduceOperation
,
InElementwiseOperation
,
InElementwiseOperation
,
AccElementwiseOperation
,
AccElementwiseOperation
,
...
@@ -114,12 +114,12 @@ void add_device_reduce_instance_multiblock_partial_reduce(
...
@@ -114,12 +114,12 @@ void add_device_reduce_instance_multiblock_partial_reduce(
};
};
#define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_TYPE( \
#define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_TYPE( \
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank,
...)
\
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank,
NumReduceDim)
\
template void add_device_reduce_instance_multiblock_partial_reduce<inT, \
template void add_device_reduce_instance_multiblock_partial_reduce<inT, \
compT, \
compT, \
outT, \
outT, \
Rank, \
Rank, \
Sequence<__VA_ARGS__>
, \
NumReduceDim
, \
ReduceOpId, \
ReduceOpId, \
NanOpt, \
NanOpt, \
IndicesOpt>( \
IndicesOpt>( \
...
@@ -127,7 +127,7 @@ void add_device_reduce_instance_multiblock_partial_reduce(
...
@@ -127,7 +127,7 @@ void add_device_reduce_instance_multiblock_partial_reduce(
device_op_instances)
device_op_instances)
#define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID( \
#define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID( \
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank,
...)
\
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank,
NumReduceDim)
\
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_TYPE(inT, \
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_TYPE(inT, \
compT, \
compT, \
outT, \
outT, \
...
@@ -135,16 +135,15 @@ void add_device_reduce_instance_multiblock_partial_reduce(
...
@@ -135,16 +135,15 @@ void add_device_reduce_instance_multiblock_partial_reduce(
static_cast<NanPropagation_t>(NanOpt), \
static_cast<NanPropagation_t>(NanOpt), \
static_cast<ReduceTensorIndices_t>(IndicesOpt), \
static_cast<ReduceTensorIndices_t>(IndicesOpt), \
Rank, \
Rank, \
__VA_ARGS__
)
NumReduceDim
)
#define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_TYPE( \
#define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_TYPE( \
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...) \
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
extern template void \
extern template void add_device_reduce_instance_multiblock_partial_reduce<inT, \
add_device_reduce_instance_multiblock_partial_reduce<inT, \
compT, \
compT, \
outT, \
outT, \
Rank, \
Rank, \
Sequence<__VA_ARGS__>,
\
NumReduceDim,
\
ReduceOpId, \
ReduceOpId, \
NanOpt, \
NanOpt, \
IndicesOpt>( \
IndicesOpt>( \
...
@@ -156,7 +155,7 @@ void add_device_reduce_instance_multiblock_partial_reduce(
...
@@ -156,7 +155,7 @@ void add_device_reduce_instance_multiblock_partial_reduce(
device_op_instances)
device_op_instances)
#define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID( \
#define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID( \
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank,
...)
\
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank,
NumReduceDim)
\
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_TYPE(inT, \
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_TYPE(inT, \
compT, \
compT, \
outT, \
outT, \
...
@@ -164,7 +163,7 @@ void add_device_reduce_instance_multiblock_partial_reduce(
...
@@ -164,7 +163,7 @@ void add_device_reduce_instance_multiblock_partial_reduce(
static_cast<NanPropagation_t>(NanOpt), \
static_cast<NanPropagation_t>(NanOpt), \
static_cast<ReduceTensorIndices_t>(IndicesOpt), \
static_cast<ReduceTensorIndices_t>(IndicesOpt), \
Rank, \
Rank, \
__VA_ARGS__
)
NumReduceDim
)
}
// namespace device_reduce_instance
}
// namespace device_reduce_instance
}
// namespace device
}
// namespace device
...
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.hpp
View file @
b7a6f810
...
@@ -11,25 +11,25 @@ namespace device {
...
@@ -11,25 +11,25 @@ namespace device {
namespace
device_reduce_instance
{
namespace
device_reduce_instance
{
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDim
s
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank |
Num
ReduceDim
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
0
,
4
,
0
,
1
,
2
);
// for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
0
,
4
,
3
);
// for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
0
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
0
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
0
,
4
,
0
,
1
,
2
);
// for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
0
,
4
,
3
);
// for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
0
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
0
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
0
,
4
,
0
,
1
,
2
);
// for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
0
,
4
,
3
);
// for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
0
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
0
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
1
,
4
,
0
,
1
,
2
);
// for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
1
,
4
,
3
);
// for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
1
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
1
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
1
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
1
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
1
,
4
,
0
,
1
,
2
);
// for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
1
,
4
,
3
);
// for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
1
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
1
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
1
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
1
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
1
,
4
,
0
,
1
,
2
);
// for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
1
,
4
,
3
);
// for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
1
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
1
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
1
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
1
,
2
,
1
);
// clang-format on
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device_reduce_instance
...
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.hpp
View file @
b7a6f810
...
@@ -11,16 +11,16 @@ namespace device {
...
@@ -11,16 +11,16 @@ namespace device {
namespace
device_reduce_instance
{
namespace
device_reduce_instance
{
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDim
s
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank |
Num
ReduceDim
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
0
,
0
,
0
,
4
,
0
,
1
,
2
);
// for ADD
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
0
,
0
,
0
,
4
,
3
);
// for ADD
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
0
,
0
,
0
,
4
,
0
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
0
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
0
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
0
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
5
,
0
,
0
,
4
,
0
,
1
,
2
);
// for AVG
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
5
,
0
,
0
,
4
,
3
);
// for AVG
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
5
,
0
,
0
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
5
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
5
,
0
,
0
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
5
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
7
,
0
,
0
,
4
,
0
,
1
,
2
);
// for NORM2
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
7
,
0
,
0
,
4
,
3
);
// for NORM2
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
7
,
0
,
0
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
7
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
7
,
0
,
0
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
7
,
0
,
0
,
2
,
1
);
// clang-format on
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device_reduce_instance
...
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.hpp
View file @
b7a6f810
...
@@ -11,29 +11,29 @@ namespace device {
...
@@ -11,29 +11,29 @@ namespace device {
namespace
device_reduce_instance
{
namespace
device_reduce_instance
{
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDim
s
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank |
Num
ReduceDim
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
0
,
4
,
0
,
1
,
2
);
// for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
0
,
4
,
3
);
// for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
0
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
0
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
0
,
4
,
0
,
1
,
2
);
// for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
0
,
4
,
3
);
// for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
0
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
0
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
0
,
4
,
0
,
1
,
2
);
// for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
0
,
4
,
3
);
// for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
0
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
0
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
1
,
4
,
0
,
1
,
2
);
// for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
1
,
4
,
3
);
// for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
1
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
1
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
1
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
2
,
0
,
1
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
1
,
4
,
0
,
1
,
2
);
// for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
1
,
4
,
3
);
// for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
1
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
1
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
1
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
3
,
0
,
1
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
1
,
4
,
0
,
1
,
2
);
// for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
1
,
4
,
3
);
// for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
1
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
1
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
1
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
4
,
0
,
1
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
7
,
0
,
0
,
4
,
0
,
1
,
2
);
// for NORM2
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
7
,
0
,
0
,
4
,
3
);
// for NORM2
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
7
,
0
,
0
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
7
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
7
,
0
,
0
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
float
,
float
,
7
,
0
,
0
,
2
,
1
);
// clang-format on
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device_reduce_instance
...
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.hpp
View file @
b7a6f810
...
@@ -11,10 +11,10 @@ namespace device {
...
@@ -11,10 +11,10 @@ namespace device {
namespace
device_reduce_instance
{
namespace
device_reduce_instance
{
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDim
s
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank |
Num
ReduceDim
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
double
,
float
,
7
,
0
,
0
,
4
,
0
,
1
,
2
);
// for NORM2
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
double
,
float
,
7
,
0
,
0
,
4
,
3
);
// for NORM2
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
double
,
float
,
7
,
0
,
0
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
double
,
float
,
7
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
double
,
float
,
7
,
0
,
0
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
float
,
double
,
float
,
7
,
0
,
0
,
2
,
1
);
// clang-format on
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device_reduce_instance
...
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.hpp
View file @
b7a6f810
...
@@ -11,37 +11,37 @@ namespace device {
...
@@ -11,37 +11,37 @@ namespace device {
namespace
device_reduce_instance
{
namespace
device_reduce_instance
{
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDim
s
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank |
Num
ReduceDim
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
4
,
0
,
1
,
2
);
// for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
4
,
3
);
// for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
4
,
0
,
1
,
2
);
// for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
4
,
3
);
// for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
4
,
0
,
1
,
2
);
// for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
4
,
3
);
// for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
4
,
0
,
1
,
2
);
// for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
4
,
3
);
// for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
2
,
0
,
1
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
4
,
0
,
1
,
2
);
// for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
4
,
3
);
// for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
3
,
0
,
1
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
4
,
0
,
1
,
2
);
// for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
4
,
3
);
// for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
4
,
0
,
1
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
4
,
0
,
1
,
2
);
// for NORM2
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
4
,
3
);
// for NORM2
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
7
,
0
,
0
,
2
,
1
);
// Will be moved to use MultiBlockAtomicAdd
// Will be moved to use MultiBlockAtomicAdd
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
4
,
0
,
1
,
2
);
// for ADD
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
4
,
3
);
// for ADD
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
0
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
4
,
0
,
1
,
2
);
// for AVG
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
4
,
3
);
// for AVG
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
4
,
0
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
2
,
1
);
//
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID
(
double
,
double
,
double
,
5
,
0
,
0
,
2
,
1
);
// clang-format on
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device_reduce_instance
...
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp
View file @
b7a6f810
...
@@ -57,7 +57,7 @@ template <typename InDataType,
...
@@ -57,7 +57,7 @@ template <typename InDataType,
typename
AccDataType
,
typename
AccDataType
,
typename
OutDataType
,
typename
OutDataType
,
int
Rank
,
int
Rank
,
typename
ReduceDim
s
,
int
Num
ReduceDim
,
ReduceTensorOp_t
ReduceOpId
,
ReduceTensorOp_t
ReduceOpId
,
NanPropagation_t
NanOpt
,
NanPropagation_t
NanOpt
,
ReduceTensorIndices_t
IndicesOpt
>
ReduceTensorIndices_t
IndicesOpt
>
...
@@ -89,7 +89,7 @@ void add_device_reduce_instance_threadwise(
...
@@ -89,7 +89,7 @@ void add_device_reduce_instance_threadwise(
AccDataType
,
AccDataType
,
OutDataType
,
OutDataType
,
Rank
,
Rank
,
ReduceDim
s
,
Num
ReduceDim
,
ReduceOperation
,
ReduceOperation
,
InElementwiseOperation
,
InElementwiseOperation
,
AccElementwiseOperation
,
AccElementwiseOperation
,
...
@@ -108,18 +108,20 @@ void add_device_reduce_instance_threadwise(
...
@@ -108,18 +108,20 @@ void add_device_reduce_instance_threadwise(
});
});
};
};
#define ADD_THREADWISE_INST_BY_TYPE(inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...) \
#define ADD_THREADWISE_INST_BY_TYPE( \
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
template void add_device_reduce_instance_threadwise<inT, \
template void add_device_reduce_instance_threadwise<inT, \
compT, \
compT, \
outT, \
outT, \
Rank, \
Rank, \
Sequence<__VA_ARGS__>,
\
NumReduceDim,
\
ReduceOpId, \
ReduceOpId, \
NanOpt, \
NanOpt, \
IndicesOpt>( \
IndicesOpt>( \
std::vector<deviceReduceThreadWisePtrType<compT, ReduceOpId>> & device_op_instances)
std::vector<deviceReduceThreadWisePtrType<compT, ReduceOpId>> & device_op_instances)
#define ADD_THREADWISE_INST_BY_ID(inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...) \
#define ADD_THREADWISE_INST_BY_ID( \
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
ADD_THREADWISE_INST_BY_TYPE(inT, \
ADD_THREADWISE_INST_BY_TYPE(inT, \
compT, \
compT, \
outT, \
outT, \
...
@@ -127,15 +129,15 @@ void add_device_reduce_instance_threadwise(
...
@@ -127,15 +129,15 @@ void add_device_reduce_instance_threadwise(
static_cast<NanPropagation_t>(NanOpt), \
static_cast<NanPropagation_t>(NanOpt), \
static_cast<ReduceTensorIndices_t>(IndicesOpt), \
static_cast<ReduceTensorIndices_t>(IndicesOpt), \
Rank, \
Rank, \
__VA_ARGS__
)
NumReduceDim
)
#define ADD_THREADWISE_INST_REF_BY_TYPE( \
#define ADD_THREADWISE_INST_REF_BY_TYPE( \
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank,
...)
\
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank,
NumReduceDim)
\
extern template void add_device_reduce_instance_threadwise<inT, \
extern template void add_device_reduce_instance_threadwise<inT, \
compT, \
compT, \
outT, \
outT, \
Rank, \
Rank, \
Sequence<__VA_ARGS__>,
\
NumReduceDim,
\
ReduceOpId, \
ReduceOpId, \
NanOpt, \
NanOpt, \
IndicesOpt>( \
IndicesOpt>( \
...
@@ -145,7 +147,8 @@ void add_device_reduce_instance_threadwise(
...
@@ -145,7 +147,8 @@ void add_device_reduce_instance_threadwise(
AccElementwiseOperation>> & \
AccElementwiseOperation>> & \
device_op_instances)
device_op_instances)
#define ADD_THREADWISE_INST_REF_BY_ID(inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...) \
#define ADD_THREADWISE_INST_REF_BY_ID( \
inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
ADD_THREADWISE_INST_REF_BY_TYPE(inT, \
ADD_THREADWISE_INST_REF_BY_TYPE(inT, \
compT, \
compT, \
outT, \
outT, \
...
@@ -153,7 +156,7 @@ void add_device_reduce_instance_threadwise(
...
@@ -153,7 +156,7 @@ void add_device_reduce_instance_threadwise(
static_cast<NanPropagation_t>(NanOpt), \
static_cast<NanPropagation_t>(NanOpt), \
static_cast<ReduceTensorIndices_t>(IndicesOpt), \
static_cast<ReduceTensorIndices_t>(IndicesOpt), \
Rank, \
Rank, \
__VA_ARGS__
)
NumReduceDim
)
}
// namespace device_reduce_instance
}
// namespace device_reduce_instance
}
// namespace device
}
// namespace device
...
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp
View file @
b7a6f810
...
@@ -11,25 +11,25 @@ namespace device {
...
@@ -11,25 +11,25 @@ namespace device {
namespace
device_reduce_instance
{
namespace
device_reduce_instance
{
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDim
s
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank |
Num
ReduceDim
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
0
,
4
,
0
,
1
,
2
);
// for MIN
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
0
,
4
,
3
);
// for MIN
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
0
,
4
,
0
);
//
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
0
,
4
,
1
);
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
0
,
2
,
1
);
//
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
0
,
2
,
1
);
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
0
,
4
,
0
,
1
,
2
);
// for MAX
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
0
,
4
,
3
);
// for MAX
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
0
,
4
,
0
);
//
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
0
,
4
,
1
);
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
0
,
2
,
1
);
//
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
0
,
2
,
1
);
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
0
,
4
,
0
,
1
,
2
);
// for AMAX
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
0
,
4
,
3
);
// for AMAX
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
0
,
4
,
0
);
//
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
0
,
4
,
1
);
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
0
,
2
,
1
);
//
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
0
,
2
,
1
);
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
1
,
4
,
0
,
1
,
2
);
// for MIN
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
1
,
4
,
3
);
// for MIN
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
1
,
4
,
0
);
//
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
1
,
4
,
1
);
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
1
,
2
,
1
);
//
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
2
,
0
,
1
,
2
,
1
);
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
1
,
4
,
0
,
1
,
2
);
// for MAX
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
1
,
4
,
3
);
// for MAX
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
1
,
4
,
0
);
//
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
1
,
4
,
1
);
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
1
,
2
,
1
);
//
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
3
,
0
,
1
,
2
,
1
);
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
1
,
4
,
0
,
1
,
2
);
// for AMAX
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
1
,
4
,
3
);
// for AMAX
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
1
,
4
,
0
);
//
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
1
,
4
,
1
);
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
1
,
2
,
1
);
//
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
half_t
,
half_t
,
4
,
0
,
1
,
2
,
1
);
// clang-format on
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device_reduce_instance
...
...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp
View file @
b7a6f810
...
@@ -11,16 +11,16 @@ namespace device {
...
@@ -11,16 +11,16 @@ namespace device {
namespace
device_reduce_instance
{
namespace
device_reduce_instance
{
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDim
s
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank |
Num
ReduceDim
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
0
,
0
,
0
,
4
,
0
,
1
,
2
);
// for ADD
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
0
,
0
,
0
,
4
,
3
);
// for ADD
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
0
,
0
,
0
,
4
,
0
);
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
0
,
0
,
0
,
4
,
1
);
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
0
,
0
,
0
,
2
,
1
);
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
0
,
0
,
0
,
2
,
1
);
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
5
,
0
,
0
,
4
,
0
,
1
,
2
);
// for AVG
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
5
,
0
,
0
,
4
,
3
);
// for AVG
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
5
,
0
,
0
,
4
,
0
);
//
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
5
,
0
,
0
,
4
,
1
);
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
5
,
0
,
0
,
2
,
1
);
//
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
5
,
0
,
0
,
2
,
1
);
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
7
,
0
,
0
,
4
,
0
,
1
,
2
);
// for NORM2
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
7
,
0
,
0
,
4
,
3
);
// for NORM2
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
7
,
0
,
0
,
4
,
0
);
//
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
7
,
0
,
0
,
4
,
1
);
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
7
,
0
,
0
,
2
,
1
);
//
ADD_THREADWISE_INST_REF_BY_ID
(
half_t
,
float
,
half_t
,
7
,
0
,
0
,
2
,
1
);
// clang-format on
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device_reduce_instance
...
...
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment