gaoqiong / composable_kernel / Commits / 478df149

Commit 478df149, authored Jan 18, 2023 by fsx950223

    Merge remote-tracking branch 'origin/develop' into embeddings

parents: 8941136f 80e05267
Changes: 211
Showing 20 changed files with 225 additions and 482 deletions (+225 -482)
Changed files shown on this page:

library/include/ck/library/tensor_operation_instance/gpu/reduce/
    device_reduce_instance_threadwise_f32_f64_f32_avg.hpp      +4    -4
    device_reduce_instance_threadwise_f32_f64_f32_norm2.hpp    +4    -4
    device_reduce_instance_threadwise_f64_f64_f64_add.hpp      +4    -4
    device_reduce_instance_threadwise_f64_f64_f64_amax.hpp     +8    -8
    device_reduce_instance_threadwise_f64_f64_f64_avg.hpp      +4    -4
    device_reduce_instance_threadwise_f64_f64_f64_max.hpp      +8    -8
    device_reduce_instance_threadwise_f64_f64_f64_min.hpp      +8    -8
    device_reduce_instance_threadwise_f64_f64_f64_norm2.hpp    +4    -4
    device_reduce_instance_threadwise_i8_i32_i8_add.hpp        +4    -4
    device_reduce_instance_threadwise_i8_i32_i8_avg.hpp        +4    -4
    device_reduce_instance_threadwise_i8_i8_i8_amax.hpp        +8    -8
    device_reduce_instance_threadwise_i8_i8_i8_max.hpp         +8    -8
    device_reduce_instance_threadwise_i8_i8_i8_min.hpp         +8    -8
    reduce.hpp                                                  +117  -0

library/include/ck/library/utility/
    host_reduction.hpp                                          +0    -374

library/src/tensor_operation_instance/gpu/reduce/
    device_reduce_instance_blockwise_b16_f32_b16_add.cpp       +4    -4
    device_reduce_instance_blockwise_b16_f32_b16_amax.cpp      +8    -8
    device_reduce_instance_blockwise_b16_f32_b16_avg.cpp       +4    -4
    device_reduce_instance_blockwise_b16_f32_b16_max.cpp       +8    -8
    device_reduce_instance_blockwise_b16_f32_b16_min.cpp       +8    -8
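Every threadwise header hunk below applies the same mechanical change: the DeviceReducePtr elements of the vector passed to add_device_reduce_instance_threadwise are now parameterized by the full instance description (data types, reduce operation, PropagateNan/UseIndex flags) instead of only the rank, the number of reduced dimensions, and the two elementwise operators, so the pointer type mirrors DeviceReduce itself. The sketch below only illustrates the shape of that change with stand-in types; it is not the library's actual definition (in particular, whether the pointer alias is a std::unique_ptr is an assumption here).

```cpp
// Illustrative stand-ins only; the real DeviceReduce/DeviceReducePtr live in
// ck/tensor_operation/gpu/device/device_reduce.hpp and are more involved.
#include <memory>
#include <vector>

struct PassThrough {};
struct UnaryDivide {};
struct ReduceAdd {};
using F32 = float;
using F64 = double;

// After this commit the pointer alias carries the same parameters as the device op:
template <typename InDataType, typename AccDataType, typename OutDataType,
          int Rank, int NumReduceDim, typename ReduceOperation,
          typename InElementwiseOp, typename AccElementwiseOp,
          bool PropagateNan, bool UseIndex>
struct DeviceReduce
{
    virtual ~DeviceReduce() = default;
};

template <typename InDataType, typename AccDataType, typename OutDataType,
          int Rank, int NumReduceDim, typename ReduceOperation,
          typename InElementwiseOp, typename AccElementwiseOp,
          bool PropagateNan, bool UseIndex>
using DeviceReducePtr = std::unique_ptr<DeviceReduce<InDataType, AccDataType, OutDataType,
                                                     Rank, NumReduceDim, ReduceOperation,
                                                     InElementwiseOp, AccElementwiseOp,
                                                     PropagateNan, UseIndex>>;

// Old declarations took std::vector<DeviceReducePtr<4, 3, PassThrough, UnaryDivide>>&;
// new ones take the fully specified form:
using AvgInstancePtr =
    DeviceReducePtr<F32, F64, F32, 4, 3, ReduceAdd, PassThrough, UnaryDivide, false, false>;

int main()
{
    std::vector<AvgInstancePtr> instances; // filled by add_device_reduce_instance_* in the library
    return instances.empty() ? 0 : 1;
}
```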
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.hpp
@@ -15,10 +15,10 @@ namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
-extern template void add_device_reduce_instance_threadwise<F32, F64, F32, 4, 3, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, UnaryDivide>>&);
+extern template void add_device_reduce_instance_threadwise<F32, F64, F32, 4, 3, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<F32, F64, F32, 4, 3, ReduceAdd, PassThrough, UnaryDivide, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F32, F64, F32, 4, 4, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, UnaryDivide>>&);
+extern template void add_device_reduce_instance_threadwise<F32, F64, F32, 4, 4, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<F32, F64, F32, 4, 4, ReduceAdd, PassThrough, UnaryDivide, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F32, F64, F32, 4, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, UnaryDivide>>&);
+extern template void add_device_reduce_instance_threadwise<F32, F64, F32, 4, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<F32, F64, F32, 4, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F32, F64, F32, 2, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, UnaryDivide>>&);
+extern template void add_device_reduce_instance_threadwise<F32, F64, F32, 2, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<F32, F64, F32, 2, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>>&);
 // clang-format on
 } // namespace instance
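These headers only declare the instantiations; the definitions are compiled once in the matching .cpp files under library/src/tensor_operation_instance/gpu/reduce (several of the blockwise ones appear at the end of this page). A minimal, self-contained sketch of that extern-template split, using hypothetical names rather than the library's:

```cpp
#include <vector>

// "add_instances.hpp" (hypothetical): declare the template and suppress implicit
// instantiation in every including translation unit.
template <typename T, int Rank>
void add_instances(std::vector<T>& instances);

extern template void add_instances<float, 4>(std::vector<float>&);

// "add_instances.cpp" (hypothetical): the one translation unit that owns the
// definition and the explicit instantiation the header promised.
template <typename T, int Rank>
void add_instances(std::vector<T>& instances)
{
    instances.push_back(T{}); // the real library appends device-op pointers here
}

template void add_instances<float, 4>(std::vector<float>&);

int main()
{
    std::vector<float> v;
    add_instances<float, 4>(v);
    return v.size() == 1 ? 0 : 1;
}
```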
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.hpp
@@ -15,10 +15,10 @@ namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
-extern template void add_device_reduce_instance_threadwise<F32, F64, F32, 4, 3, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<4, 3, UnarySquare, UnarySqrt>>&);
+extern template void add_device_reduce_instance_threadwise<F32, F64, F32, 4, 3, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<F32, F64, F32, 4, 3, ReduceAdd, UnarySquare, UnarySqrt, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F32, F64, F32, 4, 4, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<4, 4, UnarySquare, UnarySqrt>>&);
+extern template void add_device_reduce_instance_threadwise<F32, F64, F32, 4, 4, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<F32, F64, F32, 4, 4, ReduceAdd, UnarySquare, UnarySqrt, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F32, F64, F32, 4, 1, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<4, 1, UnarySquare, UnarySqrt>>&);
+extern template void add_device_reduce_instance_threadwise<F32, F64, F32, 4, 1, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<F32, F64, F32, 4, 1, ReduceAdd, UnarySquare, UnarySqrt, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F32, F64, F32, 2, 1, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<2, 1, UnarySquare, UnarySqrt>>&);
+extern template void add_device_reduce_instance_threadwise<F32, F64, F32, 2, 1, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<F32, F64, F32, 2, 1, ReduceAdd, UnarySquare, UnarySqrt, false, false>>&);
 // clang-format on
 } // namespace instance
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.hpp
@@ -15,10 +15,10 @@ namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 3, ReduceAdd, PassThrough, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 4, ReduceAdd, PassThrough, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 1, ReduceAdd, PassThrough, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 2, 1, ReduceAdd, PassThrough, PassThrough, false, false>>&);
 // clang-format on
 } // namespace instance
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.hpp
@@ -15,14 +15,14 @@ namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, UnaryAbs, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, UnaryAbs, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, UnaryAbs, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, UnaryAbs, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 3, UnaryAbs, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 4, UnaryAbs, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 1, UnaryAbs, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<2, 1, UnaryAbs, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<F64, F64, F64, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
 // clang-format on
 } // namespace instance
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.hpp
@@ -15,10 +15,10 @@ namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, UnaryDivide>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 3, ReduceAdd, PassThrough, UnaryDivide, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, UnaryDivide>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 4, ReduceAdd, PassThrough, UnaryDivide, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, UnaryDivide>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, UnaryDivide>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 2, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>>&);
 // clang-format on
 } // namespace instance
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.hpp
@@ -15,14 +15,14 @@ namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 3, ReduceMax, PassThrough, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 4, ReduceMax, PassThrough, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 1, ReduceMax, PassThrough, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 2, 1, ReduceMax, PassThrough, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 3, ReduceMax, PassThrough, PassThrough, false, true>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 4, ReduceMax, PassThrough, PassThrough, false, true>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 1, ReduceMax, PassThrough, PassThrough, false, true>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<F64, F64, F64, 2, 1, ReduceMax, PassThrough, PassThrough, false, true>>&);
 // clang-format on
 } // namespace instance
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.hpp
@@ -15,14 +15,14 @@ namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 3, ReduceMin, PassThrough, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 4, ReduceMin, PassThrough, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 1, ReduceMin, PassThrough, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 2, 1, ReduceMin, PassThrough, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 3, ReduceMin, PassThrough, PassThrough, false, true>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 4, ReduceMin, PassThrough, PassThrough, false, true>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 1, ReduceMin, PassThrough, PassThrough, false, true>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<F64, F64, F64, 2, 1, ReduceMin, PassThrough, PassThrough, false, true>>&);
 // clang-format on
 } // namespace instance
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.hpp
@@ -15,10 +15,10 @@ namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<4, 3, UnarySquare, UnarySqrt>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 3, ReduceAdd, UnarySquare, UnarySqrt, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<4, 4, UnarySquare, UnarySqrt>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 4, ReduceAdd, UnarySquare, UnarySqrt, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<4, 1, UnarySquare, UnarySqrt>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 4, 1, ReduceAdd, UnarySquare, UnarySqrt, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<2, 1, UnarySquare, UnarySqrt>>&);
+extern template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<F64, F64, F64, 2, 1, ReduceAdd, UnarySquare, UnarySqrt, false, false>>&);
 // clang-format on
 } // namespace instance
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.hpp
@@ -15,10 +15,10 @@ namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
-extern template void add_device_reduce_instance_threadwise<I8, I32, I8, 4, 3, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I32, I8, 4, 3, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<I8, I32, I8, 4, 3, ReduceAdd, PassThrough, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I32, I8, 4, 4, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I32, I8, 4, 4, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<I8, I32, I8, 4, 4, ReduceAdd, PassThrough, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I32, I8, 4, 1, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I32, I8, 4, 1, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<I8, I32, I8, 4, 1, ReduceAdd, PassThrough, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I32, I8, 2, 1, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I32, I8, 2, 1, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<I8, I32, I8, 2, 1, ReduceAdd, PassThrough, PassThrough, false, false>>&);
 // clang-format on
 } // namespace instance
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.hpp
@@ -15,10 +15,10 @@ namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
-extern template void add_device_reduce_instance_threadwise<I8, I32, I8, 4, 3, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, UnaryDivide>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I32, I8, 4, 3, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<I8, I32, I8, 4, 3, ReduceAdd, PassThrough, UnaryDivide, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I32, I8, 4, 4, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, UnaryDivide>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I32, I8, 4, 4, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<I8, I32, I8, 4, 4, ReduceAdd, PassThrough, UnaryDivide, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I32, I8, 4, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, UnaryDivide>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I32, I8, 4, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<I8, I32, I8, 4, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I32, I8, 2, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, UnaryDivide>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I32, I8, 2, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<I8, I32, I8, 2, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>>&);
 // clang-format on
 } // namespace instance
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.hpp
@@ -15,14 +15,14 @@ namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, UnaryAbs, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<I8, I8, I8, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, UnaryAbs, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<I8, I8, I8, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, UnaryAbs, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<I8, I8, I8, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, UnaryAbs, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<I8, I8, I8, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 3, UnaryAbs, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<I8, I8, I8, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 4, UnaryAbs, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<I8, I8, I8, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 1, UnaryAbs, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<I8, I8, I8, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<2, 1, UnaryAbs, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<I8, I8, I8, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
 // clang-format on
 } // namespace instance
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.hpp
@@ -15,14 +15,14 @@ namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 3, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 3, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<I8, I8, I8, 4, 3, ReduceMax, PassThrough, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 4, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 4, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<I8, I8, I8, 4, 4, ReduceMax, PassThrough, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 1, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 1, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<I8, I8, I8, 4, 1, ReduceMax, PassThrough, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 2, 1, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 2, 1, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<I8, I8, I8, 2, 1, ReduceMax, PassThrough, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 3, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 3, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<I8, I8, I8, 4, 3, ReduceMax, PassThrough, PassThrough, false, true>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 4, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 4, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<I8, I8, I8, 4, 4, ReduceMax, PassThrough, PassThrough, false, true>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 1, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 1, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<I8, I8, I8, 4, 1, ReduceMax, PassThrough, PassThrough, false, true>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 2, 1, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 2, 1, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<I8, I8, I8, 2, 1, ReduceMax, PassThrough, PassThrough, false, true>>&);
 // clang-format on
 } // namespace instance
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.hpp
@@ -15,14 +15,14 @@ namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 3, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 3, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<I8, I8, I8, 4, 3, ReduceMin, PassThrough, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 4, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 4, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<I8, I8, I8, 4, 4, ReduceMin, PassThrough, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 1, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 1, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<I8, I8, I8, 4, 1, ReduceMin, PassThrough, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 2, 1, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 2, 1, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<I8, I8, I8, 2, 1, ReduceMin, PassThrough, PassThrough, false, false>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 3, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 3, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<I8, I8, I8, 4, 3, ReduceMin, PassThrough, PassThrough, false, true>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 4, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 4, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<I8, I8, I8, 4, 4, ReduceMin, PassThrough, PassThrough, false, true>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 1, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 1, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<I8, I8, I8, 4, 1, ReduceMin, PassThrough, PassThrough, false, true>>&);
-extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 2, 1, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
+extern template void add_device_reduce_instance_threadwise<I8, I8, I8, 2, 1, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<I8, I8, I8, 2, 1, ReduceMin, PassThrough, PassThrough, false, true>>&);
 // clang-format on
 } // namespace instance
library/include/ck/library/tensor_operation_instance/gpu/reduce/reduce.hpp (new file, mode 100644, +117)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp"
#include "ck/utility/reduction_operator.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

template <typename InDataType, typename AccDataType, typename OutDataType,
          index_t Rank, index_t NumReduceDim, typename ReduceOperation,
          typename InElementwiseOp, typename AccElementwiseOp,
          bool PropagateNan, bool OutputIndex>
struct DeviceOperationInstanceFactory<
    ck::tensor_operation::device::DeviceReduce<InDataType, AccDataType, OutDataType,
                                               Rank, NumReduceDim, ReduceOperation,
                                               InElementwiseOp, AccElementwiseOp,
                                               PropagateNan, OutputIndex>>
{
    using DeviceOp = DeviceReduce<InDataType, AccDataType, OutDataType,
                                  Rank, NumReduceDim, ReduceOperation,
                                  InElementwiseOp, AccElementwiseOp,
                                  PropagateNan, OutputIndex>;

    using DeviceOpPtr = DeviceReducePtr<InDataType, AccDataType, OutDataType,
                                        Rank, NumReduceDim, ReduceOperation,
                                        InElementwiseOp, AccElementwiseOp,
                                        PropagateNan, OutputIndex>;

    static auto GetInstances()
    {
        std::vector<DeviceOpPtr> op_ptrs;

        constexpr bool out_support_atomic_add =
            ck::reduce::InMemoryDataOperationSupportedOnDataType<InMemoryDataOperationEnum::AtomicAdd,
                                                                 OutDataType>::value;

        constexpr bool op_support_atomic_add =
            std::is_same<ReduceOperation, ReduceAdd>::value &&
            (std::is_same<AccElementwiseOp, PassThrough>::value ||
             std::is_same<AccElementwiseOp, UnaryDivide>::value);

        constexpr bool use_atomic_add = (out_support_atomic_add && op_support_atomic_add);

        add_device_reduce_instance_threadwise<InDataType, AccDataType, OutDataType,
                                              Rank, NumReduceDim, ReduceOperation,
                                              InElementwiseOp, AccElementwiseOp,
                                              PropagateNan, OutputIndex>(op_ptrs);

        add_device_reduce_instance_blockwise<InDataType, AccDataType, OutDataType,
                                             Rank, NumReduceDim, ReduceOperation,
                                             InElementwiseOp, AccElementwiseOp,
                                             PropagateNan, OutputIndex>(op_ptrs);

        if constexpr(use_atomic_add)
        {
            add_device_reduce_instance_multiblock_atomic_add<InDataType, AccDataType, OutDataType,
                                                             Rank, NumReduceDim, ReduceOperation,
                                                             InElementwiseOp, AccElementwiseOp,
                                                             PropagateNan, OutputIndex>(op_ptrs);
        };

        return op_ptrs;
    }
};

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
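Client code reaches these instances through the factory specialization above. The following usage sketch is hedged: the header path and the static GetInstances signature come from this diff, while the exact namespaces of PassThrough, UnaryDivide, and the add reduction (spelled ck::reduce::Add here) are assumptions about the surrounding library.

```cpp
// Sketch: enumerate the threadwise/blockwise (and, when legal, multiblock atomic-add)
// instances for an FP32-in / FP64-accumulate / FP32-out average reduction of a
// rank-4 tensor over 3 dimensions.
#include <vector>
#include "ck/library/tensor_operation_instance/gpu/reduce/reduce.hpp"

using ck::tensor_operation::device::DeviceReduce;
using ck::tensor_operation::device::instance::DeviceOperationInstanceFactory;

// Assumed spellings of the element/reduce ops referenced throughout this commit.
using InOp  = ck::tensor_operation::element_wise::PassThrough;
using AccOp = ck::tensor_operation::element_wise::UnaryDivide;
using Op    = ck::reduce::Add;

int main()
{
    using DeviceOp = DeviceReduce<float, double, float, 4, 3, Op, InOp, AccOp, false, false>;

    auto op_ptrs = DeviceOperationInstanceFactory<DeviceOp>::GetInstances();

    return op_ptrs.empty() ? 1 : 0; // real code would pick one instance and launch it
}
```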
library/include/ck/library/utility/host_reduction.hpp (deleted, mode 100644 → 0, -374)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <vector>
#include <array>
#include <functional>

#include "ck/utility/data_type.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/utility/reduction_common.hpp"
#include "ck/utility/reduction_functions_accumulate.hpp"
#include "ck/library/utility/host_common_util.hpp"
#include "ck/library/utility/host_tensor.hpp"
template <int NDim>
static void get_all_indexes(const std::array<size_t, NDim>& dimLengths,
                            std::vector<std::array<size_t, NDim>>& indexes)
{
    static_assert(NDim >= 1, "NDim >= 1 is required to use this function!");

    if constexpr(NDim == 1)
    {
        for(size_t i = 0; i < dimLengths[0]; i++)
        {
            std::array<size_t, 1> index{i};

            indexes.push_back(index);
        };
    }
    else
    {
        std::array<size_t, NDim - 1> partial_dim_lengths;

        for(int i = 0; i < NDim - 1; i++)
            partial_dim_lengths[i] = dimLengths[i + 1];

        std::vector<std::array<size_t, NDim - 1>> partial_indexes;

        get_all_indexes<NDim - 1>(partial_dim_lengths, partial_indexes);

        for(size_t i = 0; i < dimLengths[0]; i++)
            for(const auto& index : partial_indexes)
            {
                std::array<size_t, NDim> extIndex;

                extIndex[0] = i;

                for(int k = 0; k < NDim - 1; k++)
                    extIndex[k + 1] = index[k];

                indexes.push_back(extIndex);
            };
    };
};
template <int NDim>
static size_t get_offset_from_index(const std::array<size_t, NDim>& strides,
                                    const std::array<size_t, NDim>& index)
{
    size_t offset = 0;

    for(int i = 0; i < NDim; i++)
        offset += strides[i] * index[i];

    return (offset);
};

template <int NDim>
static size_t get_offset_from_index(const std::vector<size_t>& strides,
                                    const std::array<size_t, NDim>& index)
{
    size_t offset = 0;

    for(int i = 0; i < NDim; i++)
        offset += strides[i] * index[i];

    return (offset);
};
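Both deleted overloads above compute a linear offset as the dot product of per-dimension strides with the multi-index. A tiny standalone example with made-up values:

```cpp
#include <array>
#include <cstddef>
#include <iostream>

int main()
{
    // A contiguous 2x3x4 tensor has strides {12, 4, 1}.
    std::array<std::size_t, 3> strides{12, 4, 1};
    std::array<std::size_t, 3> index{1, 2, 3};

    std::size_t offset = 0;
    for(int i = 0; i < 3; i++)
        offset += strides[i] * index[i]; // 1*12 + 2*4 + 3*1 = 23

    std::cout << offset << '\n'; // prints 23
    return 0;
}
```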
template <typename InDataType, typename AccDataType, typename OutDataType,
          typename ReduceOperation, typename InElementwiseOperation,
          typename AccElementwiseOperation,
          int Rank, int NumReduceDim, bool PropagateNan, bool OutputIndex>
struct ReductionHost
{
    using IndexDataType = int32_t;

    static constexpr int NumInvariantDim = Rank - NumReduceDim;

    std::vector<size_t> outStrides;

    IndexDataType divider;

    std::array<size_t, NumReduceDim> reduceLengths;
    std::array<size_t, NumReduceDim> reduceStrides;
    std::array<size_t, NumInvariantDim> invariantLengths;
    std::array<size_t, NumInvariantDim> invariantStrides;

    std::vector<std::array<size_t, NumReduceDim>> reduce_dim_indexes;
    std::vector<std::array<size_t, NumInvariantDim>> invariant_dim_indexes;
    ReductionHost(HostTensorDescriptor& inDesc,
                  HostTensorDescriptor& outDesc,
                  const std::array<int, NumInvariantDim> invariantDims,
                  const std::array<int, NumReduceDim> reduceDims)
    {
        // this->outLengths = to_int_vector(outDesc.GetLengths());
        this->outStrides = outDesc.GetStrides();

        int product = 1;

        for(int i = 0; i < NumReduceDim; i++)
        {
            reduceLengths[i] = inDesc.GetLengths()[reduceDims[i]];
            reduceStrides[i] = inDesc.GetStrides()[reduceDims[i]];
            product *= inDesc.GetLengths()[reduceDims[i]];
        };

        divider = product;

        for(int i = 0; i < NumInvariantDim; i++)
        {
            invariantLengths[i] = inDesc.GetLengths()[invariantDims[i]];
            invariantStrides[i] = inDesc.GetStrides()[invariantDims[i]];
        };

        reduce_dim_indexes.clear();

        get_all_indexes<NumReduceDim>(reduceLengths, reduce_dim_indexes);

        if constexpr(NumInvariantDim > 0)
        {
            invariant_dim_indexes.clear();

            get_all_indexes<NumInvariantDim>(invariantLengths, invariant_dim_indexes);
        };
    };
    void Run(float alpha,
             const InDataType* in_data,
             float beta,
             OutDataType* out_data,
             IndexDataType* out_indices,
             InElementwiseOperation in_elementwise_op,
             AccElementwiseOperation acc_elementwise_op)
    {
        if constexpr(OutputIndex)
        {
            RunImpl_with_index(
                alpha, in_data, beta, out_data, out_indices, in_elementwise_op, acc_elementwise_op);
        }
        else
        {
            RunImpl_no_index(alpha, in_data, beta, out_data, in_elementwise_op, acc_elementwise_op);
        };
    };
    void RunImpl_with_index(float alpha,
                            const InDataType* in_data,
                            float beta,
                            OutDataType* out_data,
                            IndexDataType* out_indices,
                            InElementwiseOperation in_elementwise_op,
                            AccElementwiseOperation acc_elementwise_op)
    {
        using ck::float_equal_one;
        using ck::float_equal_zero;
        using ck::type_convert;

        using Accumulation = ck::detail::AccumulateWithIndexAndNanCheck<PropagateNan,
                                                                        ReduceOperation,
                                                                        AccDataType,
                                                                        IndexDataType>;

        if constexpr(NumInvariantDim == 0)
        {
            AccDataType accuVal = ReduceOperation::template GetIdentityValue<AccDataType>();
            IndexDataType accuIndex = 0;

            for(std::size_t i = 0; i < reduce_dim_indexes.size(); i++)
            {
                auto offset_reduce =
                    get_offset_from_index<NumReduceDim>(reduceStrides, reduce_dim_indexes[i]);

                auto currVal = type_convert<AccDataType>(in_data[offset_reduce]);

                in_elementwise_op(currVal, currVal);

                auto currIndex = static_cast<IndexDataType>(i);

                Accumulation::Calculate(accuVal, currVal, accuIndex, currIndex);
            };

            acc_elementwise_op(accuVal, accuVal);

            if(!float_equal_one{}(alpha))
                accuVal *= type_convert<AccDataType>(alpha);

            if(!float_equal_zero{}(beta))
                accuVal += type_convert<AccDataType>(out_data[0]) * type_convert<AccDataType>(beta);

            out_data[0]    = type_convert<OutDataType>(accuVal);
            out_indices[0] = accuIndex;
        }
        else
        {
            auto thread_reduce_func = [&](auto invariant_index) {
                AccDataType accuVal = ReduceOperation::template GetIdentityValue<AccDataType>();
                IndexDataType accuIndex = 0;

                auto offset_invariant =
                    get_offset_from_index<NumInvariantDim>(invariantStrides, invariant_index);

                for(std::size_t i = 0; i < reduce_dim_indexes.size(); i++)
                {
                    auto offset_reduce =
                        get_offset_from_index<NumReduceDim>(reduceStrides, reduce_dim_indexes[i]);

                    auto currVal =
                        type_convert<AccDataType>(in_data[offset_invariant + offset_reduce]);

                    in_elementwise_op(currVal, currVal);

                    auto currIndex = static_cast<IndexDataType>(i);

                    Accumulation::Calculate(accuVal, currVal, accuIndex, currIndex);
                };

                acc_elementwise_op(accuVal, accuVal);

                if(!float_equal_one{}(alpha))
                    accuVal *= type_convert<AccDataType>(alpha);

                auto dst_offset =
                    get_offset_from_index<NumInvariantDim>(outStrides, invariant_index);

                if(!float_equal_zero{}(beta))
                    accuVal += type_convert<AccDataType>(out_data[dst_offset]) *
                               type_convert<AccDataType>(beta);

                out_data[dst_offset]    = type_convert<OutDataType>(accuVal);
                out_indices[dst_offset] = accuIndex;
            };

            std::size_t num_thread = 1;
            std::size_t work_per_thread =
                (invariant_dim_indexes.size() + num_thread - 1) / num_thread;

            std::vector<joinable_thread> threads(num_thread);

            for(std::size_t it = 0; it < num_thread; ++it)
            {
                std::size_t iw_begin = it * work_per_thread;
                std::size_t iw_end =
                    std::min((it + 1) * work_per_thread, invariant_dim_indexes.size());

                auto f = [=] {
                    for(std::size_t iw = iw_begin; iw < iw_end; ++iw)
                    {
                        thread_reduce_func(invariant_dim_indexes[iw]);
                    }
                };

                threads[it] = joinable_thread(f);
            }
        };
    };
    void RunImpl_no_index(float alpha,
                          const InDataType* in_data,
                          float beta,
                          OutDataType* out_data,
                          InElementwiseOperation in_elementwise_op,
                          AccElementwiseOperation acc_elementwise_op)
    {
        using ck::float_equal_one;
        using ck::float_equal_zero;
        using ck::type_convert;

        using Accumulation =
            ck::detail::AccumulateWithNanCheck<PropagateNan, ReduceOperation, AccDataType>;

        if constexpr(NumInvariantDim == 0)
        {
            AccDataType accuVal = ReduceOperation::template GetIdentityValue<AccDataType>();

            for(const auto& reduce_index : reduce_dim_indexes)
            {
                auto offset_reduce =
                    get_offset_from_index<NumReduceDim>(reduceStrides, reduce_index);

                auto currVal = type_convert<AccDataType>(in_data[offset_reduce]);

                in_elementwise_op(currVal, currVal);

                Accumulation::Calculate(accuVal, currVal);
            };

            acc_elementwise_op(accuVal, accuVal);

            if(!float_equal_one{}(alpha))
                accuVal *= type_convert<AccDataType>(alpha);

            if(!float_equal_zero{}(beta))
                accuVal += type_convert<AccDataType>(out_data[0]) * type_convert<AccDataType>(beta);

            out_data[0] = type_convert<OutDataType>(accuVal);
        }
        else
        {
            auto thread_reduce_func = [&](auto invariant_index) {
                AccDataType accuVal = ReduceOperation::template GetIdentityValue<AccDataType>();

                auto offset_invariant =
                    get_offset_from_index<NumInvariantDim>(invariantStrides, invariant_index);

                for(const auto& reduce_index : reduce_dim_indexes)
                {
                    auto offset_reduce =
                        get_offset_from_index<NumReduceDim>(reduceStrides, reduce_index);

                    auto currVal =
                        type_convert<AccDataType>(in_data[offset_invariant + offset_reduce]);

                    in_elementwise_op(currVal, currVal);

                    Accumulation::Calculate(accuVal, currVal);
                };

                acc_elementwise_op(accuVal, accuVal);

                if(!float_equal_one{}(alpha))
                    accuVal *= type_convert<AccDataType>(alpha);

                auto dst_offset =
                    get_offset_from_index<NumInvariantDim>(outStrides, invariant_index);

                if(!float_equal_zero{}(beta))
                    accuVal += type_convert<AccDataType>(out_data[dst_offset]) *
                               type_convert<AccDataType>(beta);

                out_data[dst_offset] = type_convert<OutDataType>(accuVal);
            };

            std::size_t num_thread = 1;
            std::size_t work_per_thread =
                (invariant_dim_indexes.size() + num_thread - 1) / num_thread;

            std::vector<joinable_thread> threads(num_thread);

            for(std::size_t it = 0; it < num_thread; ++it)
            {
                std::size_t iw_begin = it * work_per_thread;
                std::size_t iw_end =
                    std::min((it + 1) * work_per_thread, invariant_dim_indexes.size());

                auto f = [=] {
                    for(std::size_t iw = iw_begin; iw < iw_end; ++iw)
                    {
                        thread_reduce_func(invariant_dim_indexes[iw]);
                    }
                };

                threads[it] = joinable_thread(f);
            }
        };
    };
};
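For orientation, here is a minimal host-side sketch of how ReductionHost can serve as a CPU reference when checking a blockwise add-reduction. It is not part of this commit: the ReduceAdd and PassThrough aliases are assumed to be the same ones used by the instance files below, and the lengths-only HostTensorDescriptor construction is likewise an assumption.

// Hedged sketch: verify a 4D -> 1D add-reduction on the host (assumed aliases and
// constructors as noted above; types and signatures of ReductionHost are from this file).
void host_reference_add_reduce(const float* in, float* out)
{
    HostTensorDescriptor inDesc({16, 8, 8, 8}); // Rank = 4 (assumed lengths-only ctor)
    HostTensorDescriptor outDesc({16});         // dims 1, 2, 3 reduced away

    ReductionHost<float,       // InDataType
                  float,       // AccDataType
                  float,       // OutDataType
                  ReduceAdd,   // ReduceOperation (assumed alias)
                  PassThrough, // InElementwiseOperation (assumed alias)
                  PassThrough, // AccElementwiseOperation (assumed alias)
                  4,           // Rank
                  3,           // NumReduceDim
                  false,       // PropagateNan
                  false>       // OutputIndex
        hostReduce(inDesc, outDesc, {0}, {1, 2, 3});

    // out = 1.0f * reduce(in) + 0.0f * out; indices are unused when OutputIndex == false.
    hostReduce.Run(1.0f, in, 0.0f, out, nullptr, PassThrough{}, PassThrough{});
}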
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.cpp
...
@@ -11,10 +11,10 @@ namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 3, ReduceAdd, PassThrough, PassThrough, false, false>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 4, ReduceAdd, PassThrough, PassThrough, false, false>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 1, ReduceAdd, PassThrough, PassThrough, false, false>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 2, 1, ReduceAdd, PassThrough, PassThrough, false, false>>&);
 // clang-format on
 } // namespace instance
...
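To illustrate what the signature change above means for callers (a sketch under assumptions, not code from this commit): DeviceReducePtr now carries the full problem description, so the container passed to the factory spells out the data types, reduce operation and flags rather than only the rank, reduce-dim count and elementwise ops.

// Hedged consumer sketch for the new signature shown above.
std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 3, ReduceAdd, PassThrough, PassThrough, false, false>>
    reduce_ptrs;

add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceAdd, PassThrough, PassThrough,
                                     false, false>(reduce_ptrs);

// reduce_ptrs now holds the registered blockwise BF16->F32->BF16, rank-4, 3-reduce-dim
// ReduceAdd instances.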
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.cpp
...
@@ -11,14 +11,14 @@ namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, UnaryAbs, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, UnaryAbs, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, UnaryAbs, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, UnaryAbs, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 3, UnaryAbs, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 4, UnaryAbs, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 1, UnaryAbs, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<2, 1, UnaryAbs, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<BF16, F32, BF16, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
 // clang-format on
 } // namespace instance
...
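The amax file above also registers UseIndex = true variants; these appear to be the instances whose output additionally reports the winning index, mirroring the out_indices path of ReductionHost::RunImpl_with_index. A hedged sketch of requesting them, using the same aliases as the instance file:

std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, true>>
    amax_index_ptrs;

add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceAMax, UnaryAbs, PassThrough,
                                     false, true>(amax_index_ptrs);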
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.cpp
...
@@ -11,10 +11,10 @@ namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, UnaryDivide>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 3, ReduceAdd, PassThrough, UnaryDivide, false, false>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, UnaryDivide>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 4, ReduceAdd, PassThrough, UnaryDivide, false, false>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, UnaryDivide>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, UnaryDivide>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 2, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>>&);
 // clang-format on
 } // namespace instance
...
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.cpp
...
@@ -11,14 +11,14 @@ namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 3, ReduceMax, PassThrough, PassThrough, false, false>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 4, ReduceMax, PassThrough, PassThrough, false, false>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 1, ReduceMax, PassThrough, PassThrough, false, false>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 2, 1, ReduceMax, PassThrough, PassThrough, false, false>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 3, ReduceMax, PassThrough, PassThrough, false, true>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 4, ReduceMax, PassThrough, PassThrough, false, true>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 1, ReduceMax, PassThrough, PassThrough, false, true>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<BF16, F32, BF16, 2, 1, ReduceMax, PassThrough, PassThrough, false, true>>&);
 // clang-format on
 } // namespace instance
...
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.cpp
...
@@ -11,14 +11,14 @@ namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 3, ReduceMin, PassThrough, PassThrough, false, false>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 4, ReduceMin, PassThrough, PassThrough, false, false>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 1, ReduceMin, PassThrough, PassThrough, false, false>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<BF16, F32, BF16, 2, 1, ReduceMin, PassThrough, PassThrough, false, false>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 3, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 3, ReduceMin, PassThrough, PassThrough, false, true>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 4, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 4, ReduceMin, PassThrough, PassThrough, false, true>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 4, 1, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<BF16, F32, BF16, 4, 1, ReduceMin, PassThrough, PassThrough, false, true>>&);
-template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
+template void add_device_reduce_instance_blockwise<BF16, F32, BF16, 2, 1, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<BF16, F32, BF16, 2, 1, ReduceMin, PassThrough, PassThrough, false, true>>&);
 // clang-format on
 } // namespace instance
...