gaoqiong / composable_kernel / Commits / 478df149

Commit 478df149, authored Jan 18, 2023 by fsx950223

    Merge remote-tracking branch 'origin/develop' into embeddings

parents: 8941136f 80e05267
Changes: 211
Showing 20 changed files with 225 additions and 482 deletions (+225 -482)
Changed files shown on this page:

library/include/ck/library/tensor_operation_instance/gpu/reduce/
    device_reduce_instance_threadwise_f32_f64_f32_avg.hpp      +4    -4
    device_reduce_instance_threadwise_f32_f64_f32_norm2.hpp    +4    -4
    device_reduce_instance_threadwise_f64_f64_f64_add.hpp      +4    -4
    device_reduce_instance_threadwise_f64_f64_f64_amax.hpp     +8    -8
    device_reduce_instance_threadwise_f64_f64_f64_avg.hpp      +4    -4
    device_reduce_instance_threadwise_f64_f64_f64_max.hpp      +8    -8
    device_reduce_instance_threadwise_f64_f64_f64_min.hpp      +8    -8
    device_reduce_instance_threadwise_f64_f64_f64_norm2.hpp    +4    -4
    device_reduce_instance_threadwise_i8_i32_i8_add.hpp        +4    -4
    device_reduce_instance_threadwise_i8_i32_i8_avg.hpp        +4    -4
    device_reduce_instance_threadwise_i8_i8_i8_amax.hpp        +8    -8
    device_reduce_instance_threadwise_i8_i8_i8_max.hpp         +8    -8
    device_reduce_instance_threadwise_i8_i8_i8_min.hpp         +8    -8
    reduce.hpp                                                  +117  -0

library/include/ck/library/utility/
    host_reduction.hpp                                          +0    -374

library/src/tensor_operation_instance/gpu/reduce/
    device_reduce_instance_blockwise_b16_f32_b16_add.cpp       +4    -4
    device_reduce_instance_blockwise_b16_f32_b16_amax.cpp      +8    -8
    device_reduce_instance_blockwise_b16_f32_b16_avg.cpp       +4    -4
    device_reduce_instance_blockwise_b16_f32_b16_max.cpp       +8    -8
    device_reduce_instance_blockwise_b16_f32_b16_min.cpp       +8    -8
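Every threadwise header hunk below applies the same mechanical change: the DeviceReducePtr elements of the vector passed to add_device_reduce_instance_threadwise are now parameterized by the full instance description (data types, reduce operation, PropagateNan/UseIndex flags) instead of only the rank, the number of reduced dimensions, and the two elementwise operators, so the pointer type mirrors DeviceReduce itself. The sketch below only illustrates the shape of that change with stand-in types; it is not the library's actual definition (in particular, whether the pointer alias is a std::unique_ptr is an assumption here).

```cpp
// Illustrative stand-ins only; the real DeviceReduce/DeviceReducePtr live in
// ck/tensor_operation/gpu/device/device_reduce.hpp and are more involved.
#include <memory>
#include <vector>

struct PassThrough {};
struct UnaryDivide {};
struct ReduceAdd {};
using F32 = float;
using F64 = double;

// After this commit the pointer alias carries the same parameters as the device op:
template <typename InDataType, typename AccDataType, typename OutDataType,
          int Rank, int NumReduceDim, typename ReduceOperation,
          typename InElementwiseOp, typename AccElementwiseOp,
          bool PropagateNan, bool UseIndex>
struct DeviceReduce
{
    virtual ~DeviceReduce() = default;
};

template <typename InDataType, typename AccDataType, typename OutDataType,
          int Rank, int NumReduceDim, typename ReduceOperation,
          typename InElementwiseOp, typename AccElementwiseOp,
          bool PropagateNan, bool UseIndex>
using DeviceReducePtr = std::unique_ptr<DeviceReduce<InDataType, AccDataType, OutDataType,
                                                     Rank, NumReduceDim, ReduceOperation,
                                                     InElementwiseOp, AccElementwiseOp,
                                                     PropagateNan, UseIndex>>;

// Old declarations took std::vector<DeviceReducePtr<4, 3, PassThrough, UnaryDivide>>&;
// new ones take the fully specified form:
using AvgInstancePtr =
    DeviceReducePtr<F32, F64, F32, 4, 3, ReduceAdd, PassThrough, UnaryDivide, false, false>;

int main()
{
    std::vector<AvgInstancePtr> instances; // filled by add_device_reduce_instance_* in the library
    return instances.empty() ? 0 : 1;
}
```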
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.hpp
@@ -15,10 +15,10 @@ namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
-extern template void add_device_reduce_instance_threadwise<F32, F64, F32, 4, 3, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, UnaryDivide>>&);
+extern template void add_device_reduce_instance_threadwise<F32, F64, F32, 4, 3, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<F32, F64, F32, 4, 3, ReduceAdd, PassThrough, UnaryDivide, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F32, F64, F32, 4, 4, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, UnaryDivide>>&);
+extern template void add_device_reduce_instance_threadwise<F32, F64, F32, 4, 4, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<F32, F64, F32, 4, 4, ReduceAdd, PassThrough, UnaryDivide, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F32, F64, F32, 4, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, UnaryDivide>>&);
+extern template void add_device_reduce_instance_threadwise<F32, F64, F32, 4, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<F32, F64, F32, 4, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F32, F64, F32, 2, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, UnaryDivide>>&);
+extern template void add_device_reduce_instance_threadwise<F32, F64, F32, 2, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<F32, F64, F32, 2, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>>&);
 // clang-format on
 } // namespace instance
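These headers only declare the instantiations; the definitions are compiled once in the matching .cpp files under library/src/tensor_operation_instance/gpu/reduce (several of the blockwise ones appear at the end of this page). A minimal, self-contained sketch of that extern-template split, using hypothetical names rather than the library's:

```cpp
#include <vector>

// "add_instances.hpp" (hypothetical): declare the template and suppress implicit
// instantiation in every including translation unit.
template <typename T, int Rank>
void add_instances(std::vector<T>& instances);

extern template void add_instances<float, 4>(std::vector<float>&);

// "add_instances.cpp" (hypothetical): the one translation unit that owns the
// definition and the explicit instantiation the header promised.
template <typename T, int Rank>
void add_instances(std::vector<T>& instances)
{
    instances.push_back(T{}); // the real library appends device-op pointers here
}

template void add_instances<float, 4>(std::vector<float>&);

int main()
{
    std::vector<float> v;
    add_instances<float, 4>(v);
    return v.size() == 1 ? 0 : 1;
}
```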
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.hpp
@@ -15,10 +15,10 @@ namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
-extern template void add_device_reduce_instance_threadwise<F32, F64, F32, 4, 3, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<4, 3, UnarySquare, UnarySqrt>>&);
+extern template void add_device_reduce_instance_threadwise<F32, F64, F32, 4, 3, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<F32, F64, F32, 4, 3, ReduceAdd, UnarySquare, UnarySqrt, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F32, F64, F32, 4, 4, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<4, 4, UnarySquare, UnarySqrt>>&);
+extern template void add_device_reduce_instance_threadwise<F32, F64, F32, 4, 4, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<F32, F64, F32, 4, 4, ReduceAdd, UnarySquare, UnarySqrt, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F32, F64, F32, 4, 1, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<4, 1, UnarySquare, UnarySqrt>>&);
+extern template void add_device_reduce_instance_threadwise<F32, F64, F32, 4, 1, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<F32, F64, F32, 4, 1, ReduceAdd, UnarySquare, UnarySqrt, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F32, F64, F32, 2, 1, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<2, 1, UnarySquare, UnarySqrt>>&);
+extern template void add_device_reduce_instance_threadwise<F32, F64, F32, 2, 1, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<F32, F64, F32, 2, 1, ReduceAdd, UnarySquare, UnarySqrt, false, false>>&);
 // clang-format on
 } // namespace instance
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.hpp
@@ -15,10 +15,10 @@ namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 3, ReduceAdd, PassThrough, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 4, ReduceAdd, PassThrough, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 1, ReduceAdd, PassThrough, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 2, 1, ReduceAdd, PassThrough, PassThrough, false, false>>&);
 // clang-format on
 } // namespace instance
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.hpp
@@ -15,14 +15,14 @@ namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, UnaryAbs, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, UnaryAbs, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, UnaryAbs, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, UnaryAbs, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 3, UnaryAbs, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 4, UnaryAbs, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 1, UnaryAbs, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<2, 1, UnaryAbs, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<F64, F64, F64, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
 // clang-format on
 } // namespace instance
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.hpp
@@ -15,10 +15,10 @@ namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, UnaryDivide>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 3, ReduceAdd, PassThrough, UnaryDivide, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, UnaryDivide>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 4, ReduceAdd, PassThrough, UnaryDivide, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, UnaryDivide>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, UnaryDivide>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 2, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>>&);
 // clang-format on
 } // namespace instance
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.hpp
@@ -15,14 +15,14 @@ namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 3, ReduceMax, PassThrough, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 4, ReduceMax, PassThrough, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 1, ReduceMax, PassThrough, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 2, 1, ReduceMax, PassThrough, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 3, ReduceMax, PassThrough, PassThrough, false, true>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 4, ReduceMax, PassThrough, PassThrough, false, true>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 1, ReduceMax, PassThrough, PassThrough, false, true>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<F64, F64, F64, 2, 1, ReduceMax, PassThrough, PassThrough, false, true>>&);
 // clang-format on
 } // namespace instance
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.hpp
@@ -15,14 +15,14 @@ namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 3, ReduceMin, PassThrough, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 4, ReduceMin, PassThrough, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 1, ReduceMin, PassThrough, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 2, 1, ReduceMin, PassThrough, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 3, ReduceMin, PassThrough, PassThrough, false, true>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 4, ReduceMin, PassThrough, PassThrough, false, true>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 1, ReduceMin, PassThrough, PassThrough, false, true>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<F64, F64, F64, 2, 1, ReduceMin, PassThrough, PassThrough, false, true>>&);
 // clang-format on
 } // namespace instance
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.hpp
@@ -15,10 +15,10 @@ namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<4, 3, UnarySquare, UnarySqrt>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 3, ReduceAdd, UnarySquare, UnarySqrt, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<4, 4, UnarySquare, UnarySqrt>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 4, ReduceAdd, UnarySquare, UnarySqrt, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<4, 1, UnarySquare, UnarySqrt>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 1, ReduceAdd, UnarySquare, UnarySqrt, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<2, 1, UnarySquare, UnarySqrt>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 2, 1, ReduceAdd, UnarySquare, UnarySqrt, false, false>>&);
 // clang-format on
 } // namespace instance
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.hpp
@@ -15,10 +15,10 @@ namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
-extern template void add_device_reduce_instance_threadwise<I8, I32, I8, 4, 3, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I32, I8, 4, 3, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<I8, I32, I8, 4, 3, ReduceAdd, PassThrough, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I32, I8, 4, 4, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I32, I8, 4, 4, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<I8, I32, I8, 4, 4, ReduceAdd, PassThrough, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I32, I8, 4, 1, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I32, I8, 4, 1, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<I8, I32, I8, 4, 1, ReduceAdd, PassThrough, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I32, I8, 2, 1, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I32, I8, 2, 1, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<I8, I32, I8, 2, 1, ReduceAdd, PassThrough, PassThrough, false, false>>&);
 // clang-format on
 } // namespace instance
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.hpp
@@ -15,10 +15,10 @@ namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
-extern template void add_device_reduce_instance_threadwise<I8, I32, I8, 4, 3, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, UnaryDivide>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I32, I8, 4, 3, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<I8, I32, I8, 4, 3, ReduceAdd, PassThrough, UnaryDivide, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I32, I8, 4, 4, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, UnaryDivide>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I32, I8, 4, 4, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<I8, I32, I8, 4, 4, ReduceAdd, PassThrough, UnaryDivide, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I32, I8, 4, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, UnaryDivide>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I32, I8, 4, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<I8, I32, I8, 4, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I32, I8, 2, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, UnaryDivide>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I32, I8, 2, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<I8, I32, I8, 2, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>>&);
 // clang-format on
 } // namespace instance
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.hpp
@@ -15,14 +15,14 @@ namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, UnaryAbs, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<I8, I8, I8, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, UnaryAbs, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<I8, I8, I8, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, UnaryAbs, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<I8, I8, I8, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, UnaryAbs, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<I8, I8, I8, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 3, UnaryAbs, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<I8, I8, I8, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 4, UnaryAbs, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<I8, I8, I8, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 1, UnaryAbs, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<I8, I8, I8, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<2, 1, UnaryAbs, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<I8, I8, I8, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
 // clang-format on
 } // namespace instance
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.hpp
@@ -15,14 +15,14 @@ namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 3, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 3, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<I8, I8, I8, 4, 3, ReduceMax, PassThrough, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 4, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 4, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<I8, I8, I8, 4, 4, ReduceMax, PassThrough, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 1, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 1, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<I8, I8, I8, 4, 1, ReduceMax, PassThrough, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 2, 1, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 2, 1, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<I8, I8, I8, 2, 1, ReduceMax, PassThrough, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 3, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 3, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<I8, I8, I8, 4, 3, ReduceMax, PassThrough, PassThrough, false, true>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 4, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 4, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<I8, I8, I8, 4, 4, ReduceMax, PassThrough, PassThrough, false, true>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 1, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 1, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<I8, I8, I8, 4, 1, ReduceMax, PassThrough, PassThrough, false, true>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 2, 1, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 2, 1, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<I8, I8, I8, 2, 1, ReduceMax, PassThrough, PassThrough, false, true>>&);
 // clang-format on
 } // namespace instance
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.hpp
@@ -15,14 +15,14 @@ namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 3, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 3, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<I8, I8, I8, 4, 3, ReduceMin, PassThrough, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 4, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 4, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<I8, I8, I8, 4, 4, ReduceMin, PassThrough, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 1, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 1, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<I8, I8, I8, 4, 1, ReduceMin, PassThrough, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 2, 1, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 2, 1, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<I8, I8, I8, 2, 1, ReduceMin, PassThrough, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 3, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 3, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<I8, I8, I8, 4, 3, ReduceMin, PassThrough, PassThrough, false, true>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 4, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 4, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<I8, I8, I8, 4, 4, ReduceMin, PassThrough, PassThrough, false, true>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 1, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 1, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<I8, I8, I8, 4, 1, ReduceMin, PassThrough, PassThrough, false, true>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 2, 1, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 2, 1, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<I8, I8, I8, 2, 1, ReduceMin, PassThrough, PassThrough, false, true>>&);
 // clang-format on
 } // namespace instance
library/include/ck/library/tensor_operation_instance/gpu/reduce/reduce.hpp (new file, mode 100644, +117)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp"
#include "ck/utility/reduction_operator.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

template <typename InDataType, typename AccDataType, typename OutDataType,
          index_t Rank, index_t NumReduceDim, typename ReduceOperation,
          typename InElementwiseOp, typename AccElementwiseOp,
          bool PropagateNan, bool OutputIndex>
struct DeviceOperationInstanceFactory<
    ck::tensor_operation::device::DeviceReduce<InDataType, AccDataType, OutDataType,
                                               Rank, NumReduceDim, ReduceOperation,
                                               InElementwiseOp, AccElementwiseOp,
                                               PropagateNan, OutputIndex>>
{
    using DeviceOp = DeviceReduce<InDataType, AccDataType, OutDataType,
                                  Rank, NumReduceDim, ReduceOperation,
                                  InElementwiseOp, AccElementwiseOp,
                                  PropagateNan, OutputIndex>;

    using DeviceOpPtr = DeviceReducePtr<InDataType, AccDataType, OutDataType,
                                        Rank, NumReduceDim, ReduceOperation,
                                        InElementwiseOp, AccElementwiseOp,
                                        PropagateNan, OutputIndex>;

    static auto GetInstances()
    {
        std::vector<DeviceOpPtr> op_ptrs;

        constexpr bool out_support_atomic_add =
            ck::reduce::InMemoryDataOperationSupportedOnDataType<InMemoryDataOperationEnum::AtomicAdd,
                                                                 OutDataType>::value;

        constexpr bool op_support_atomic_add =
            std::is_same<ReduceOperation, ReduceAdd>::value &&
            (std::is_same<AccElementwiseOp, PassThrough>::value ||
             std::is_same<AccElementwiseOp, UnaryDivide>::value);

        constexpr bool use_atomic_add = (out_support_atomic_add && op_support_atomic_add);

        add_device_reduce_instance_threadwise<InDataType, AccDataType, OutDataType,
                                              Rank, NumReduceDim, ReduceOperation,
                                              InElementwiseOp, AccElementwiseOp,
                                              PropagateNan, OutputIndex>(op_ptrs);

        add_device_reduce_instance_blockwise<InDataType, AccDataType, OutDataType,
                                             Rank, NumReduceDim, ReduceOperation,
                                             InElementwiseOp, AccElementwiseOp,
                                             PropagateNan, OutputIndex>(op_ptrs);

        if constexpr(use_atomic_add)
        {
            add_device_reduce_instance_multiblock_atomic_add<InDataType, AccDataType, OutDataType,
                                                             Rank, NumReduceDim, ReduceOperation,
                                                             InElementwiseOp, AccElementwiseOp,
                                                             PropagateNan, OutputIndex>(op_ptrs);
        };

        return op_ptrs;
    }
};

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
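Client code reaches these instances through the factory specialization above. The following usage sketch is hedged: the header path and the static GetInstances signature come from this diff, while the exact namespaces of PassThrough, UnaryDivide, and the add reduction (spelled ck::reduce::Add here) are assumptions about the surrounding library.

```cpp
// Sketch: enumerate the threadwise/blockwise (and, when legal, multiblock atomic-add)
// instances for an FP32-in / FP64-accumulate / FP32-out average reduction of a
// rank-4 tensor over 3 dimensions.
#include <vector>
#include "ck/library/tensor_operation_instance/gpu/reduce/reduce.hpp"

using ck::tensor_operation::device::DeviceReduce;
using ck::tensor_operation::device::instance::DeviceOperationInstanceFactory;

// Assumed spellings of the element/reduce ops referenced throughout this commit.
using InOp  = ck::tensor_operation::element_wise::PassThrough;
using AccOp = ck::tensor_operation::element_wise::UnaryDivide;
using Op    = ck::reduce::Add;

int main()
{
    using DeviceOp = DeviceReduce<float, double, float, 4, 3, Op, InOp, AccOp, false, false>;

    auto op_ptrs = DeviceOperationInstanceFactory<DeviceOp>::GetInstances();

    return op_ptrs.empty() ? 1 : 0; // real code would pick one instance and launch it
}
```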
library/include/ck/library/utility/host_reduction.hpp (deleted, mode 100644 → 0, -374)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <vector>
#include <array>
#include <functional>

#include "ck/utility/data_type.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/utility/reduction_common.hpp"
#include "ck/utility/reduction_functions_accumulate.hpp"
#include "ck/library/utility/host_common_util.hpp"
#include "ck/library/utility/host_tensor.hpp"
template <int NDim>
static void get_all_indexes(const std::array<size_t, NDim>& dimLengths,
                            std::vector<std::array<size_t, NDim>>& indexes)
{
    static_assert(NDim >= 1, "NDim >= 1 is required to use this function!");

    if constexpr(NDim == 1)
    {
        for(size_t i = 0; i < dimLengths[0]; i++)
        {
            std::array<size_t, 1> index{i};

            indexes.push_back(index);
        };
    }
    else
    {
        std::array<size_t, NDim - 1> partial_dim_lengths;

        for(int i = 0; i < NDim - 1; i++)
            partial_dim_lengths[i] = dimLengths[i + 1];

        std::vector<std::array<size_t, NDim - 1>> partial_indexes;

        get_all_indexes<NDim - 1>(partial_dim_lengths, partial_indexes);

        for(size_t i = 0; i < dimLengths[0]; i++)
            for(const auto& index : partial_indexes)
            {
                std::array<size_t, NDim> extIndex;

                extIndex[0] = i;

                for(int k = 0; k < NDim - 1; k++)
                    extIndex[k + 1] = index[k];

                indexes.push_back(extIndex);
            };
    };
};
template <int NDim>
static size_t get_offset_from_index(const std::array<size_t, NDim>& strides,
                                    const std::array<size_t, NDim>& index)
{
    size_t offset = 0;

    for(int i = 0; i < NDim; i++)
        offset += strides[i] * index[i];

    return (offset);
};

template <int NDim>
static size_t get_offset_from_index(const std::vector<size_t>& strides,
                                    const std::array<size_t, NDim>& index)
{
    size_t offset = 0;

    for(int i = 0; i < NDim; i++)
        offset += strides[i] * index[i];

    return (offset);
};
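Both deleted overloads above compute a linear offset as the dot product of per-dimension strides with the multi-index. A tiny standalone example with made-up values:

```cpp
#include <array>
#include <cstddef>
#include <iostream>

int main()
{
    // A contiguous 2x3x4 tensor has strides {12, 4, 1}.
    std::array<std::size_t, 3> strides{12, 4, 1};
    std::array<std::size_t, 3> index{1, 2, 3};

    std::size_t offset = 0;
    for(int i = 0; i < 3; i++)
        offset += strides[i] * index[i]; // 1*12 + 2*4 + 3*1 = 23

    std::cout << offset << '\n'; // prints 23
    return 0;
}
```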
template <typename InDataType, typename AccDataType, typename OutDataType,
          typename ReduceOperation, typename InElementwiseOperation,
          typename AccElementwiseOperation,
          int Rank, int NumReduceDim, bool PropagateNan, bool OutputIndex>
struct ReductionHost
{
    using IndexDataType = int32_t;

    static constexpr int NumInvariantDim = Rank - NumReduceDim;

    std::vector<size_t> outStrides;

    IndexDataType divider;

    std::array<size_t, NumReduceDim> reduceLengths;
    std::array<size_t, NumReduceDim> reduceStrides;
    std::array<size_t, NumInvariantDim> invariantLengths;
    std::array<size_t, NumInvariantDim> invariantStrides;

    std::vector<std::array<size_t, NumReduceDim>> reduce_dim_indexes;
    std::vector<std::array<size_t, NumInvariantDim>> invariant_dim_indexes;
    ReductionHost(HostTensorDescriptor& inDesc,
                  HostTensorDescriptor& outDesc,
                  const std::array<int, NumInvariantDim> invariantDims,
                  const std::array<int, NumReduceDim> reduceDims)
    {
        // this->outLengths = to_int_vector(outDesc.GetLengths());
        this->outStrides = outDesc.GetStrides();

        int product = 1;

        for(int i = 0; i < NumReduceDim; i++)
        {
            reduceLengths[i] = inDesc.GetLengths()[reduceDims[i]];
            reduceStrides[i] = inDesc.GetStrides()[reduceDims[i]];
            product *= inDesc.GetLengths()[reduceDims[i]];
        };

        divider = product;

        for(int i = 0; i < NumInvariantDim; i++)
        {
            invariantLengths[i] = inDesc.GetLengths()[invariantDims[i]];
            invariantStrides[i] = inDesc.GetStrides()[invariantDims[i]];
        };

        reduce_dim_indexes.clear();

        get_all_indexes<NumReduceDim>(reduceLengths, reduce_dim_indexes);

        if constexpr(NumInvariantDim > 0)
        {
            invariant_dim_indexes.clear();

            get_all_indexes<NumInvariantDim>(invariantLengths, invariant_dim_indexes);
        };
    };
    void Run(float alpha,
             const InDataType* in_data,
             float beta,
             OutDataType* out_data,
             IndexDataType* out_indices,
             InElementwiseOperation in_elementwise_op,
             AccElementwiseOperation acc_elementwise_op)
    {
        if constexpr(OutputIndex)
        {
            RunImpl_with_index(
                alpha, in_data, beta, out_data, out_indices, in_elementwise_op, acc_elementwise_op);
        }
        else
        {
            RunImpl_no_index(alpha, in_data, beta, out_data, in_elementwise_op, acc_elementwise_op);
        };
    };
    void RunImpl_with_index(float alpha,
                            const InDataType* in_data,
                            float beta,
                            OutDataType* out_data,
                            IndexDataType* out_indices,
                            InElementwiseOperation in_elementwise_op,
                            AccElementwiseOperation acc_elementwise_op)
    {
        using ck::float_equal_one;
        using ck::float_equal_zero;
        using ck::type_convert;

        using Accumulation = ck::detail::AccumulateWithIndexAndNanCheck<PropagateNan,
                                                                        ReduceOperation,
                                                                        AccDataType,
                                                                        IndexDataType>;

        if constexpr(NumInvariantDim == 0)
        {
            AccDataType accuVal = ReduceOperation::template GetIdentityValue<AccDataType>();
            IndexDataType accuIndex = 0;

            for(std::size_t i = 0; i < reduce_dim_indexes.size(); i++)
            {
                auto offset_reduce =
                    get_offset_from_index<NumReduceDim>(reduceStrides, reduce_dim_indexes[i]);

                auto currVal = type_convert<AccDataType>(in_data[offset_reduce]);

                in_elementwise_op(currVal, currVal);

                auto currIndex = static_cast<IndexDataType>(i);

                Accumulation::Calculate(accuVal, currVal, accuIndex, currIndex);
            };

            acc_elementwise_op(accuVal, accuVal);

            if(!float_equal_one{}(alpha))
                accuVal *= type_convert<AccDataType>(alpha);

            if(!float_equal_zero{}(beta))
                accuVal += type_convert<AccDataType>(out_data[0]) * type_convert<AccDataType>(beta);

            out_data[0]    = type_convert<OutDataType>(accuVal);
            out_indices[0] = accuIndex;
        }
        else
        {
            auto thread_reduce_func = [&](auto invariant_index) {
                AccDataType accuVal = ReduceOperation::template GetIdentityValue<AccDataType>();
                IndexDataType accuIndex = 0;

                auto offset_invariant =
                    get_offset_from_index<NumInvariantDim>(invariantStrides, invariant_index);

                for(std::size_t i = 0; i < reduce_dim_indexes.size(); i++)
                {
                    auto offset_reduce =
                        get_offset_from_index<NumReduceDim>(reduceStrides, reduce_dim_indexes[i]);

                    auto currVal =
                        type_convert<AccDataType>(in_data[offset_invariant + offset_reduce]);

                    in_elementwise_op(currVal, currVal);

                    auto currIndex = static_cast<IndexDataType>(i);

                    Accumulation::Calculate(accuVal, currVal, accuIndex, currIndex);
                };

                acc_elementwise_op(accuVal, accuVal);

                if(!float_equal_one{}(alpha))
                    accuVal *= type_convert<AccDataType>(alpha);

                auto dst_offset =
                    get_offset_from_index<NumInvariantDim>(outStrides, invariant_index);

                if(!float_equal_zero{}(beta))
                    accuVal += type_convert<AccDataType>(out_data[dst_offset]) *
                               type_convert<AccDataType>(beta);

                out_data[dst_offset]    = type_convert<OutDataType>(accuVal);
                out_indices[dst_offset] = accuIndex;
            };

            std::size_t num_thread = 1;
            std::size_t work_per_thread =
                (invariant_dim_indexes.size() + num_thread - 1) / num_thread;

            std::vector<joinable_thread> threads(num_thread);

            for(std::size_t it = 0; it < num_thread; ++it)
            {
                std::size_t iw_begin = it * work_per_thread;
                std::size_t iw_end =
                    std::min((it + 1) * work_per_thread, invariant_dim_indexes.size());

                auto f = [=] {
                    for(std::size_t iw = iw_begin; iw < iw_end; ++iw)
                    {
                        thread_reduce_func(invariant_dim_indexes[iw]);
                    }
                };

                threads[it] = joinable_thread(f);
            }
        };
    };
    void RunImpl_no_index(float alpha,
                          const InDataType* in_data,
                          float beta,
                          OutDataType* out_data,
                          InElementwiseOperation in_elementwise_op,
                          AccElementwiseOperation acc_elementwise_op)
    {
        using ck::float_equal_one;
        using ck::float_equal_zero;
        using ck::type_convert;

        using Accumulation =
            ck::detail::AccumulateWithNanCheck<PropagateNan, ReduceOperation, AccDataType>;

        if constexpr(NumInvariantDim == 0)
        {
            AccDataType accuVal = ReduceOperation::template GetIdentityValue<AccDataType>();

            for(const auto& reduce_index : reduce_dim_indexes)
            {
                auto offset_reduce =
                    get_offset_from_index<NumReduceDim>(reduceStrides, reduce_index);

                auto currVal = type_convert<AccDataType>(in_data[offset_reduce]);

                in_elementwise_op(currVal, currVal);

                Accumulation::Calculate(accuVal, currVal);
            };

            acc_elementwise_op(accuVal, accuVal);

            if(!float_equal_one{}(alpha))
                accuVal *= type_convert<AccDataType>(alpha);

            if(!float_equal_zero{}(beta))
                accuVal += type_convert<AccDataType>(out_data[0]) * type_convert<AccDataType>(beta);

            out_data[0] = type_convert<OutDataType>(accuVal);
        }
        else
        {
            auto thread_reduce_func = [&](auto invariant_index) {
                AccDataType accuVal = ReduceOperation::template GetIdentityValue<AccDataType>();

                auto offset_invariant =
                    get_offset_from_index<NumInvariantDim>(invariantStrides, invariant_index);

                for(const auto& reduce_index : reduce_dim_indexes)
                {
                    auto offset_reduce =
                        get_offset_from_index<NumReduceDim>(reduceStrides, reduce_index);

                    auto currVal =
                        type_convert<AccDataType>(in_data[offset_invariant + offset_reduce]);

                    in_elementwise_op(currVal, currVal);

                    Accumulation::Calculate(accuVal, currVal);
                };

                acc_elementwise_op(accuVal, accuVal);

                if(!float_equal_one{}(alpha))
                    accuVal *= type_convert<AccDataType>(alpha);

                auto dst_offset =
                    get_offset_from_index<NumInvariantDim>(outStrides, invariant_index);

                if(!float_equal_zero{}(beta))
                    accuVal += type_convert<AccDataType>(out_data[dst_offset]) *
                               type_convert<AccDataType>(beta);

                out_data[dst_offset] = type_convert<OutDataType>(accuVal);
            };

            std::size_t num_thread = 1;
            std::size_t work_per_thread =
                (invariant_dim_indexes.size() + num_thread - 1) / num_thread;

            std::vector<joinable_thread> threads(num_thread);

            for(std::size_t it = 0; it < num_thread; ++it)
            {
                std::size_t iw_begin = it * work_per_thread;
                std::size_t iw_end =
                    std::min((it + 1) * work_per_thread, invariant_dim_indexes.size());

                auto f = [=] {
                    for(std::size_t iw = iw_begin; iw < iw_end; ++iw)
                    {
                        thread_reduce_func(invariant_dim_indexes[iw]);
                    }
                };

                threads[it] = joinable_thread(f);
            }
        };
    };
};
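For orientation, here is a minimal host-side sketch of how ReductionHost can serve as a CPU reference when checking a blockwise add-reduction. It is not part of this commit: the ReduceAdd and PassThrough aliases are assumed to be the same ones used by the instance files below, and the lengths-only HostTensorDescriptor construction is likewise an assumption.

// Hedged sketch: verify a 4D -> 1D add-reduction on the host (assumed aliases and
// constructors as noted above; types and signatures of ReductionHost are from this file).
void host_reference_add_reduce(const float* in, float* out)
{
    HostTensorDescriptor inDesc({16, 8, 8, 8}); // Rank = 4 (assumed lengths-only ctor)
    HostTensorDescriptor outDesc({16});         // dims 1, 2, 3 reduced away

    ReductionHost<float,       // InDataType
                  float,       // AccDataType
                  float,       // OutDataType
                  ReduceAdd,   // ReduceOperation (assumed alias)
                  PassThrough, // InElementwiseOperation (assumed alias)
                  PassThrough, // AccElementwiseOperation (assumed alias)
                  4,           // Rank
                  3,           // NumReduceDim
                  false,       // PropagateNan
                  false>       // OutputIndex
        hostReduce(inDesc, outDesc, {0}, {1, 2, 3});

    // out = 1.0f * reduce(in) + 0.0f * out; indices are unused when OutputIndex == false.
    hostReduce.Run(1.0f, in, 0.0f, out, nullptr, PassThrough{}, PassThrough{});
}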
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.cpp
...
@@ -11,10 +11,10 @@ namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 3, ReduceAdd, PassThrough, PassThrough, false, false>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 4, ReduceAdd, PassThrough, PassThrough, false, false>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 1, ReduceAdd, PassThrough, PassThrough, false, false>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 2, 1, ReduceAdd, PassThrough, PassThrough, false, false>>&);
 // clang-format on
 } // namespace instance
...
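To illustrate what the signature change above means for callers (a sketch under assumptions, not code from this commit): DeviceReducePtr now carries the full problem description, so the container passed to the factory spells out the data types, reduce operation and flags rather than only the rank, reduce-dim count and elementwise ops.

// Hedged consumer sketch for the new signature shown above.
std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 3, ReduceAdd, PassThrough, PassThrough, false, false>>
    reduce_ptrs;

add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceAdd, PassThrough, PassThrough,
                                     false, false>(reduce_ptrs);

// reduce_ptrs now holds the registered blockwise BF16->F32->BF16, rank-4, 3-reduce-dim
// ReduceAdd instances.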
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.cpp
...
@@ -11,14 +11,14 @@ namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, UnaryAbs, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, UnaryAbs, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, UnaryAbs, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, UnaryAbs, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 3, UnaryAbs, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 4, UnaryAbs, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 1, UnaryAbs, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<2, 1, UnaryAbs, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<BF16, F32, BF16, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
 // clang-format on
 } // namespace instance
...
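The amax file above also registers UseIndex = true variants; these appear to be the instances whose output additionally reports the winning index, mirroring the out_indices path of ReductionHost::RunImpl_with_index. A hedged sketch of requesting them, using the same aliases as the instance file:

std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, true>>
    amax_index_ptrs;

add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceAMax, UnaryAbs, PassThrough,
                                     false, true>(amax_index_ptrs);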
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.cpp
...
@@ -11,10 +11,10 @@ namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, UnaryDivide>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 3, ReduceAdd, PassThrough, UnaryDivide, false, false>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, UnaryDivide>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 4, ReduceAdd, PassThrough, UnaryDivide, false, false>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, UnaryDivide>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, UnaryDivide>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 2, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>>&);
 // clang-format on
 } // namespace instance
...
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.cpp
...
@@ -11,14 +11,14 @@ namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 3, ReduceMax, PassThrough, PassThrough, false, false>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 4, ReduceMax, PassThrough, PassThrough, false, false>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 1, ReduceMax, PassThrough, PassThrough, false, false>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 2, 1, ReduceMax, PassThrough, PassThrough, false, false>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 3, ReduceMax, PassThrough, PassThrough, false, true>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 4, ReduceMax, PassThrough, PassThrough, false, true>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 1, ReduceMax, PassThrough, PassThrough, false, true>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<BF16, F32, BF16, 2, 1, ReduceMax, PassThrough, PassThrough, false, true>>&);
 // clang-format on
 } // namespace instance
...
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.cpp
...
@@ -11,14 +11,14 @@ namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 3, ReduceMin, PassThrough, PassThrough, false, false>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 4, ReduceMin, PassThrough, PassThrough, false, false>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 1, ReduceMin, PassThrough, PassThrough, false, false>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 2, 1, ReduceMin, PassThrough, PassThrough, false, false>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 3, ReduceMin, PassThrough, PassThrough, false, true>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 4, ReduceMin, PassThrough, PassThrough, false, true>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 1, ReduceMin, PassThrough, PassThrough, false, true>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<BF16, F32, BF16, 2, 1, ReduceMin, PassThrough, PassThrough, false, true>>&);
 // clang-format on
 } // namespace instance
...