Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
644df335
Commit
644df335
authored
Jan 30, 2023
by
rocking
Browse files
Merge branch 'develop' into gemm_layernorm_instance
parents
d99640ab
7494c1c6
Changes
254
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
116 additions
and
116 deletions
+116
-116
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.cpp
...uce/device_reduce_instance_threadwise_f32_f32_f32_add.cpp
+4
-4
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.cpp
...ce/device_reduce_instance_threadwise_f32_f32_f32_amax.cpp
+8
-8
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.cpp
...uce/device_reduce_instance_threadwise_f32_f32_f32_avg.cpp
+4
-4
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.cpp
...uce/device_reduce_instance_threadwise_f32_f32_f32_max.cpp
+8
-8
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.cpp
...uce/device_reduce_instance_threadwise_f32_f32_f32_min.cpp
+8
-8
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.cpp
...e/device_reduce_instance_threadwise_f32_f32_f32_norm2.cpp
+4
-4
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.cpp
...uce/device_reduce_instance_threadwise_f32_f64_f32_add.cpp
+4
-4
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.cpp
...uce/device_reduce_instance_threadwise_f32_f64_f32_avg.cpp
+4
-4
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.cpp
...e/device_reduce_instance_threadwise_f32_f64_f32_norm2.cpp
+4
-4
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.cpp
...uce/device_reduce_instance_threadwise_f64_f64_f64_add.cpp
+4
-4
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.cpp
...ce/device_reduce_instance_threadwise_f64_f64_f64_amax.cpp
+8
-8
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.cpp
...uce/device_reduce_instance_threadwise_f64_f64_f64_avg.cpp
+4
-4
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.cpp
...uce/device_reduce_instance_threadwise_f64_f64_f64_max.cpp
+8
-8
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.cpp
...uce/device_reduce_instance_threadwise_f64_f64_f64_min.cpp
+8
-8
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.cpp
...e/device_reduce_instance_threadwise_f64_f64_f64_norm2.cpp
+4
-4
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.cpp
...educe/device_reduce_instance_threadwise_i8_i32_i8_add.cpp
+4
-4
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.cpp
...educe/device_reduce_instance_threadwise_i8_i32_i8_avg.cpp
+4
-4
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.cpp
...educe/device_reduce_instance_threadwise_i8_i8_i8_amax.cpp
+8
-8
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.cpp
...reduce/device_reduce_instance_threadwise_i8_i8_i8_max.cpp
+8
-8
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.cpp
...reduce/device_reduce_instance_threadwise_i8_i8_i8_min.cpp
+8
-8
No files found.
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.cpp
View file @
644df335
...
@@ -11,10 +11,10 @@ namespace instance {
...
@@ -11,10 +11,10 @@ namespace instance {
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
3
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
3
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F32
,
F32
,
4
,
3
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
4
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
4
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F32
,
F32
,
4
,
4
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
1
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
1
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F32
,
F32
,
4
,
1
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
2
,
1
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
2
,
1
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F32
,
F32
,
2
,
1
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>>&
);
// clang-format on
// clang-format on
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.cpp
View file @
644df335
...
@@ -11,14 +11,14 @@ namespace instance {
...
@@ -11,14 +11,14 @@ namespace instance {
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
3
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
UnaryAbs
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
3
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F32
,
F32
,
4
,
3
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
4
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
UnaryAbs
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
4
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F32
,
F32
,
4
,
4
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
1
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
UnaryAbs
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
1
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F32
,
F32
,
4
,
1
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
2
,
1
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
UnaryAbs
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
2
,
1
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F32
,
F32
,
2
,
1
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
3
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
UnaryAbs
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
3
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F32
,
F32
,
4
,
3
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
true
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
4
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
UnaryAbs
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
4
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F32
,
F32
,
4
,
4
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
true
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
1
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
UnaryAbs
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
1
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F32
,
F32
,
4
,
1
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
true
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
2
,
1
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
UnaryAbs
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
2
,
1
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F32
,
F32
,
2
,
1
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
true
>>&
);
// clang-format on
// clang-format on
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.cpp
View file @
644df335
...
@@ -11,10 +11,10 @@ namespace instance {
...
@@ -11,10 +11,10 @@ namespace instance {
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
3
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
PassThrough
,
UnaryDivide
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
3
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F32
,
F32
,
4
,
3
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
4
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
PassThrough
,
UnaryDivide
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
4
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F32
,
F32
,
4
,
4
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
1
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
PassThrough
,
UnaryDivide
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
1
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F32
,
F32
,
4
,
1
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
2
,
1
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
PassThrough
,
UnaryDivide
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
2
,
1
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F32
,
F32
,
2
,
1
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>>&
);
// clang-format on
// clang-format on
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.cpp
View file @
644df335
...
@@ -11,14 +11,14 @@ namespace instance {
...
@@ -11,14 +11,14 @@ namespace instance {
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
3
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
3
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F32
,
F32
,
4
,
3
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
4
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
4
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F32
,
F32
,
4
,
4
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F32
,
F32
,
4
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
2
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
2
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F32
,
F32
,
2
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
3
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
3
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F32
,
F32
,
4
,
3
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
4
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
4
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F32
,
F32
,
4
,
4
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F32
,
F32
,
4
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
2
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
2
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F32
,
F32
,
2
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>>&
);
// clang-format on
// clang-format on
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.cpp
View file @
644df335
...
@@ -11,14 +11,14 @@ namespace instance {
...
@@ -11,14 +11,14 @@ namespace instance {
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
3
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
3
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F32
,
F32
,
4
,
3
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
4
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
4
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F32
,
F32
,
4
,
4
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F32
,
F32
,
4
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
2
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
2
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F32
,
F32
,
2
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
3
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
3
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F32
,
F32
,
4
,
3
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
4
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
4
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F32
,
F32
,
4
,
4
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F32
,
F32
,
4
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
2
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
2
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F32
,
F32
,
2
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>>&
);
// clang-format on
// clang-format on
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.cpp
View file @
644df335
...
@@ -11,10 +11,10 @@ namespace instance {
...
@@ -11,10 +11,10 @@ namespace instance {
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
3
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
UnarySquare
,
UnarySqrt
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
3
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F32
,
F32
,
4
,
3
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
4
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
UnarySquare
,
UnarySqrt
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
4
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F32
,
F32
,
4
,
4
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
1
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
UnarySquare
,
UnarySqrt
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
4
,
1
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F32
,
F32
,
4
,
1
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
2
,
1
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
UnarySquare
,
UnarySqrt
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F32
,
F32
,
2
,
1
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F32
,
F32
,
2
,
1
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>>&
);
// clang-format on
// clang-format on
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.cpp
View file @
644df335
...
@@ -11,10 +11,10 @@ namespace instance {
...
@@ -11,10 +11,10 @@ namespace instance {
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
template
void
add_device_reduce_instance_threadwise
<
F32
,
F64
,
F32
,
4
,
3
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F64
,
F32
,
4
,
3
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F64
,
F32
,
4
,
3
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F64
,
F32
,
4
,
4
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F64
,
F32
,
4
,
4
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F64
,
F32
,
4
,
4
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F64
,
F32
,
4
,
1
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F64
,
F32
,
4
,
1
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F64
,
F32
,
4
,
1
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F64
,
F32
,
2
,
1
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F64
,
F32
,
2
,
1
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F64
,
F32
,
2
,
1
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>>&
);
// clang-format on
// clang-format on
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.cpp
View file @
644df335
...
@@ -11,10 +11,10 @@ namespace instance {
...
@@ -11,10 +11,10 @@ namespace instance {
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
template
void
add_device_reduce_instance_threadwise
<
F32
,
F64
,
F32
,
4
,
3
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
PassThrough
,
UnaryDivide
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F64
,
F32
,
4
,
3
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F64
,
F32
,
4
,
3
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F64
,
F32
,
4
,
4
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
PassThrough
,
UnaryDivide
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F64
,
F32
,
4
,
4
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F64
,
F32
,
4
,
4
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F64
,
F32
,
4
,
1
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
PassThrough
,
UnaryDivide
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F64
,
F32
,
4
,
1
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F64
,
F32
,
4
,
1
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F64
,
F32
,
2
,
1
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
PassThrough
,
UnaryDivide
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F64
,
F32
,
2
,
1
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F64
,
F32
,
2
,
1
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>>&
);
// clang-format on
// clang-format on
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.cpp
View file @
644df335
...
@@ -11,10 +11,10 @@ namespace instance {
...
@@ -11,10 +11,10 @@ namespace instance {
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
template
void
add_device_reduce_instance_threadwise
<
F32
,
F64
,
F32
,
4
,
3
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
UnarySquare
,
UnarySqrt
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F64
,
F32
,
4
,
3
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F64
,
F32
,
4
,
3
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F64
,
F32
,
4
,
4
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
UnarySquare
,
UnarySqrt
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F64
,
F32
,
4
,
4
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F64
,
F32
,
4
,
4
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F64
,
F32
,
4
,
1
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
UnarySquare
,
UnarySqrt
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F64
,
F32
,
4
,
1
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F64
,
F32
,
4
,
1
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F64
,
F32
,
2
,
1
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
UnarySquare
,
UnarySqrt
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F32
,
F64
,
F32
,
2
,
1
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F32
,
F64
,
F32
,
2
,
1
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>>&
);
// clang-format on
// clang-format on
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.cpp
View file @
644df335
...
@@ -11,10 +11,10 @@ namespace instance {
...
@@ -11,10 +11,10 @@ namespace instance {
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
3
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
3
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F64
,
F64
,
F64
,
4
,
3
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
4
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
4
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F64
,
F64
,
F64
,
4
,
4
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
1
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
1
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F64
,
F64
,
F64
,
4
,
1
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
2
,
1
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
2
,
1
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F64
,
F64
,
F64
,
2
,
1
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>>&
);
// clang-format on
// clang-format on
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.cpp
View file @
644df335
...
@@ -11,14 +11,14 @@ namespace instance {
...
@@ -11,14 +11,14 @@ namespace instance {
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
3
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
UnaryAbs
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
3
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F64
,
F64
,
F64
,
4
,
3
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
4
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
UnaryAbs
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
4
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F64
,
F64
,
F64
,
4
,
4
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
1
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
UnaryAbs
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
1
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F64
,
F64
,
F64
,
4
,
1
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
2
,
1
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
UnaryAbs
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
2
,
1
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F64
,
F64
,
F64
,
2
,
1
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
3
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
UnaryAbs
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
3
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
F64
,
F64
,
F64
,
4
,
3
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
true
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
4
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
UnaryAbs
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
4
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
F64
,
F64
,
F64
,
4
,
4
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
true
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
1
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
UnaryAbs
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
1
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
F64
,
F64
,
F64
,
4
,
1
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
true
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
2
,
1
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
UnaryAbs
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
2
,
1
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
F64
,
F64
,
F64
,
2
,
1
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
true
>>&
);
// clang-format on
// clang-format on
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.cpp
View file @
644df335
...
@@ -11,10 +11,10 @@ namespace instance {
...
@@ -11,10 +11,10 @@ namespace instance {
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
3
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
PassThrough
,
UnaryDivide
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
3
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F64
,
F64
,
F64
,
4
,
3
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
4
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
PassThrough
,
UnaryDivide
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
4
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F64
,
F64
,
F64
,
4
,
4
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
1
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
PassThrough
,
UnaryDivide
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
1
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F64
,
F64
,
F64
,
4
,
1
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
2
,
1
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
PassThrough
,
UnaryDivide
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
2
,
1
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F64
,
F64
,
F64
,
2
,
1
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>>&
);
// clang-format on
// clang-format on
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.cpp
View file @
644df335
...
@@ -11,14 +11,14 @@ namespace instance {
...
@@ -11,14 +11,14 @@ namespace instance {
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
3
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
3
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F64
,
F64
,
F64
,
4
,
3
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
4
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
4
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F64
,
F64
,
F64
,
4
,
4
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F64
,
F64
,
F64
,
4
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
2
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
2
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F64
,
F64
,
F64
,
2
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
3
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
3
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
F64
,
F64
,
F64
,
4
,
3
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
4
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
4
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
F64
,
F64
,
F64
,
4
,
4
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
F64
,
F64
,
F64
,
4
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
2
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
2
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
F64
,
F64
,
F64
,
2
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>>&
);
// clang-format on
// clang-format on
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.cpp
View file @
644df335
...
@@ -11,14 +11,14 @@ namespace instance {
...
@@ -11,14 +11,14 @@ namespace instance {
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
3
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
3
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F64
,
F64
,
F64
,
4
,
3
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
4
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
4
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F64
,
F64
,
F64
,
4
,
4
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F64
,
F64
,
F64
,
4
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
2
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
2
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F64
,
F64
,
F64
,
2
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
3
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
3
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
F64
,
F64
,
F64
,
4
,
3
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
4
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
4
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
F64
,
F64
,
F64
,
4
,
4
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
F64
,
F64
,
F64
,
4
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
2
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
2
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
F64
,
F64
,
F64
,
2
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>>&
);
// clang-format on
// clang-format on
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.cpp
View file @
644df335
...
@@ -11,10 +11,10 @@ namespace instance {
...
@@ -11,10 +11,10 @@ namespace instance {
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
3
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
UnarySquare
,
UnarySqrt
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
3
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F64
,
F64
,
F64
,
4
,
3
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
4
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
UnarySquare
,
UnarySqrt
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
4
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F64
,
F64
,
F64
,
4
,
4
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
1
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
UnarySquare
,
UnarySqrt
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
4
,
1
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F64
,
F64
,
F64
,
4
,
1
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
2
,
1
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
UnarySquare
,
UnarySqrt
>>&
);
template
void
add_device_reduce_instance_threadwise
<
F64
,
F64
,
F64
,
2
,
1
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
F64
,
F64
,
F64
,
2
,
1
,
ReduceAdd
,
UnarySquare
,
UnarySqrt
,
false
,
false
>>&
);
// clang-format on
// clang-format on
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.cpp
View file @
644df335
...
@@ -11,10 +11,10 @@ namespace instance {
...
@@ -11,10 +11,10 @@ namespace instance {
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
template
void
add_device_reduce_instance_threadwise
<
I8
,
I32
,
I8
,
4
,
3
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I32
,
I8
,
4
,
3
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
I8
,
I32
,
I8
,
4
,
3
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I32
,
I8
,
4
,
4
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I32
,
I8
,
4
,
4
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
I8
,
I32
,
I8
,
4
,
4
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I32
,
I8
,
4
,
1
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I32
,
I8
,
4
,
1
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
I8
,
I32
,
I8
,
4
,
1
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I32
,
I8
,
2
,
1
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I32
,
I8
,
2
,
1
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
I8
,
I32
,
I8
,
2
,
1
,
ReduceAdd
,
PassThrough
,
PassThrough
,
false
,
false
>>&
);
// clang-format on
// clang-format on
// clang-format on
// clang-format on
...
...
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.cpp
View file @
644df335
...
@@ -11,10 +11,10 @@ namespace instance {
...
@@ -11,10 +11,10 @@ namespace instance {
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
template
void
add_device_reduce_instance_threadwise
<
I8
,
I32
,
I8
,
4
,
3
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
PassThrough
,
UnaryDivide
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I32
,
I8
,
4
,
3
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
I8
,
I32
,
I8
,
4
,
3
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I32
,
I8
,
4
,
4
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
PassThrough
,
UnaryDivide
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I32
,
I8
,
4
,
4
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
I8
,
I32
,
I8
,
4
,
4
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I32
,
I8
,
4
,
1
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
PassThrough
,
UnaryDivide
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I32
,
I8
,
4
,
1
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
I8
,
I32
,
I8
,
4
,
1
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I32
,
I8
,
2
,
1
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
PassThrough
,
UnaryDivide
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I32
,
I8
,
2
,
1
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
I8
,
I32
,
I8
,
2
,
1
,
ReduceAdd
,
PassThrough
,
UnaryDivide
,
false
,
false
>>&
);
// clang-format on
// clang-format on
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.cpp
View file @
644df335
...
@@ -11,14 +11,14 @@ namespace instance {
...
@@ -11,14 +11,14 @@ namespace instance {
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
3
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
UnaryAbs
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
3
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
I8
,
I8
,
I8
,
4
,
3
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
4
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
UnaryAbs
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
4
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
I8
,
I8
,
I8
,
4
,
4
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
1
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
UnaryAbs
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
1
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
I8
,
I8
,
I8
,
4
,
1
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
2
,
1
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
UnaryAbs
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
2
,
1
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
I8
,
I8
,
I8
,
2
,
1
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
3
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
UnaryAbs
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
3
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
I8
,
I8
,
I8
,
4
,
3
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
true
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
4
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
UnaryAbs
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
4
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
I8
,
I8
,
I8
,
4
,
4
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
true
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
1
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
UnaryAbs
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
1
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
I8
,
I8
,
I8
,
4
,
1
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
true
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
2
,
1
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
UnaryAbs
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
2
,
1
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
I8
,
I8
,
I8
,
2
,
1
,
ReduceAMax
,
UnaryAbs
,
PassThrough
,
false
,
true
>>&
);
// clang-format on
// clang-format on
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.cpp
View file @
644df335
...
@@ -11,14 +11,14 @@ namespace instance {
...
@@ -11,14 +11,14 @@ namespace instance {
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
3
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
3
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
I8
,
I8
,
I8
,
4
,
3
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
4
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
4
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
I8
,
I8
,
I8
,
4
,
4
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
I8
,
I8
,
I8
,
4
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
2
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
2
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
I8
,
I8
,
I8
,
2
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
3
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
3
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
I8
,
I8
,
I8
,
4
,
3
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
4
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
4
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
I8
,
I8
,
I8
,
4
,
4
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
I8
,
I8
,
I8
,
4
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
2
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
2
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
I8
,
I8
,
I8
,
2
,
1
,
ReduceMax
,
PassThrough
,
PassThrough
,
false
,
true
>>&
);
// clang-format on
// clang-format on
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.cpp
View file @
644df335
...
@@ -11,14 +11,14 @@ namespace instance {
...
@@ -11,14 +11,14 @@ namespace instance {
// clang-format off
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
3
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
3
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
I8
,
I8
,
I8
,
4
,
3
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
4
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
4
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
I8
,
I8
,
I8
,
4
,
4
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
I8
,
I8
,
I8
,
4
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
2
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
2
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>(
std
::
vector
<
DeviceReducePtr
<
I8
,
I8
,
I8
,
2
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
false
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
3
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
3
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
3
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
I8
,
I8
,
I8
,
4
,
3
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
4
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
4
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
4
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
I8
,
I8
,
I8
,
4
,
4
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
4
,
1
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
4
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
I8
,
I8
,
I8
,
4
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
2
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
2
,
1
,
PassThrough
,
PassThrough
>>&
);
template
void
add_device_reduce_instance_threadwise
<
I8
,
I8
,
I8
,
2
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>(
std
::
vector
<
DeviceReducePtr
<
I8
,
I8
,
I8
,
2
,
1
,
ReduceMin
,
PassThrough
,
PassThrough
,
false
,
true
>>&
);
// clang-format on
// clang-format on
}
// namespace instance
}
// namespace instance
...
...
Prev
1
…
8
9
10
11
12
13
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment