Merge remote-tracking branch 'origin/develop' into contraction

7a3b49e5 · Chao Liu · e07b3d8e · d3051d75 · 7a3b49e5 · 7a3b49e5
Commit 7a3b49e5 authored Jun 25, 2022 by Chao Liu
20 changed files
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp
-#ifndef DEVICE_REDUCE_INSTANTCE_HPP
+// SPDX-License-Identifier: MIT
-#define DEVICE_REDUCE_INSTANTCE_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "device_reduce_instance_blockwise_f16_f16_f16.hpp"
+#pragma once
-#include "device_reduce_instance_blockwise_f16_f32_f16.hpp"
-#include "device_reduce_instance_blockwise_f32_f32_f32.hpp"
-#include "device_reduce_instance_blockwise_f32_f64_f32.hpp"
-#include "device_reduce_instance_blockwise_f64_f64_f64.hpp"
-#include "device_reduce_instance_blockwise_i8_i8_i8.hpp"
-#include "device_reduce_instance_blockwise_i8_i32_i8.hpp"
-#include "device_reduce_instance_blockwise_b16_f32_b16.hpp"
-#include "device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp"
-#include "device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp"
-#include "device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp"
-#include "device_reduce_instance_multiblock_atomic_add_f64_f64_f64.hpp"
-#include "device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp"
-#include "device_reduce_instance_threadwise_f16_f16_f16.hpp"
-#include "device_reduce_instance_threadwise_f16_f32_f16.hpp"
-#include "device_reduce_instance_threadwise_f32_f32_f32.hpp"
-#include "device_reduce_instance_threadwise_f32_f64_f32.hpp"
-#include "device_reduce_instance_threadwise_f64_f64_f64.hpp"
-#include "device_reduce_instance_threadwise_i8_i8_i8.hpp"
-#include "device_reduce_instance_threadwise_i8_i32_i8.hpp"
-#include "device_reduce_instance_threadwise_b16_f32_b16.hpp"
-#endif
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.hpp"
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_HPP
+// SPDX-License-Identifier: MIT
-#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "reduction_operator_mapping.hpp"
+#pragma once
-#include "device_reduce_instance_impl_common.hpp"
-#include "device_reduce_multiblock.hpp"
+#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
+#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -61,10 +63,10 @@ using reduce_configuration_2_instances_blockwise = std::tuple<
    >;
 #endif
-template <typename AccDataType, ReduceTensorOp ReduceOpId>
+template <ReduceTensorOp ReduceOpId>
 using deviceReduceBlockWisePtrType = DeviceReducePtr<
-    typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation,
+    typename reduce_unary_operator<ReduceOpId, true, true>::InElementwiseOperation,
-    typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::AccElementwiseOperation>;
+    typename reduce_unary_operator<ReduceOpId, true, true>::AccElementwiseOperation>;
 template <typename InDataType,
          typename AccDataType,
@@ -75,14 +77,13 @@ template <typename InDataType,
          bool PropagateNan,
          bool UseIndex>
 void add_device_reduce_instance_blockwise(
-    std::vector<deviceReduceBlockWisePtrType<AccDataType, ReduceOpId>>& device_op_instances)
+    std::vector<deviceReduceBlockWisePtrType<ReduceOpId>>& device_op_instances)
 {
-    using ReduceOperation = typename reduce_binary_operator<AccDataType, ReduceOpId>::opType;
+    using ReduceOperation = typename reduce_binary_operator<ReduceOpId>::opType;
    using InElementwiseOperation =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation;
+        typename reduce_unary_operator<ReduceOpId, true, true>::InElementwiseOperation;
    using AccElementwiseOperation =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::
+        typename reduce_unary_operator<ReduceOpId, true, true>::AccElementwiseOperation;
-            AccElementwiseOperation;
    constexpr bool Indexable =
        (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
@@ -137,7 +138,7 @@ void add_device_reduce_instance_blockwise(
                                                       ReduceOpId,            \
                                                       PropagateNan,          \
                                                       UseIndex>(             \
-        std::vector<deviceReduceBlockWisePtrType<compT, ReduceOpId>> & device_op_instances)
+        std::vector<deviceReduceBlockWisePtrType<ReduceOpId>> & device_op_instances)
 #define ADD_BLOCKWISE_INST_BY_ID(                                         \
    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
@@ -150,21 +151,17 @@ void add_device_reduce_instance_blockwise(
                               Rank,                                      \
                               NumReduceDim)
-#define ADD_BLOCKWISE_INST_REF_BY_TYPE(                                                            \
+#define ADD_BLOCKWISE_INST_REF_BY_TYPE(                                       \
-    inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim)                      \
+    inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim) \
-    extern template void add_device_reduce_instance_blockwise<inT,                                 \
+    extern template void add_device_reduce_instance_blockwise<inT,            \
-                                                              compT,                               \
+                                                              compT,          \
-                                                              outT,                                \
+                                                              outT,           \
-                                                              Rank,                                \
+                                                              Rank,           \
-                                                              NumReduceDim,                        \
+                                                              NumReduceDim,   \
-                                                              ReduceOpId,                          \
+                                                              ReduceOpId,     \
-                                                              PropagateNan,                        \
+                                                              PropagateNan,   \
-                                                              UseIndex>(                           \
+                                                              UseIndex>(      \
-        std::vector<DeviceReducePtr<                                                               \
+        std::vector<deviceReduceBlockWisePtrType<ReduceOpId>> & device_op_instances)
-            typename reduce_unary_operator<compT, ReduceOpId, true, true>::InElementwiseOperation, \
-            typename reduce_unary_operator<compT, ReduceOpId, true, true>::                        \
-                AccElementwiseOperation>> &                                                        \
-        device_op_instances)
 #define ADD_BLOCKWISE_INST_REF_BY_ID(                                       \
    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)   \
@@ -180,7 +177,4 @@ void add_device_reduce_instance_blockwise(
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_B16_F32_B16_HPP
+// SPDX-License-Identifier: MIT
-#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_B16_F32_B16_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "data_type.hpp"
+#pragma once
-#include "device_reduce_instance_blockwise.hpp"
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -53,7 +56,4 @@ ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_F16_F16_F16_HPP
+// SPDX-License-Identifier: MIT
-#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_F16_F16_F16_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "data_type.hpp"
+#pragma once
-#include "device_reduce_instance_blockwise.hpp"
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -40,7 +43,4 @@ ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_F16_F32_F16_HPP
+// SPDX-License-Identifier: MIT
-#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_F16_F32_F16_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "data_type.hpp"
+#pragma once
-#include "device_reduce_instance_blockwise.hpp"
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -28,7 +31,4 @@ ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_F32_F32_F32_HPP
+// SPDX-License-Identifier: MIT
-#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_F32_F32_F32_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "device_reduce_instance_blockwise.hpp"
+#pragma once
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -51,7 +55,4 @@ ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_F32_F64_F32_HPP
+// SPDX-License-Identifier: MIT
-#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_F32_F64_F32_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "device_reduce_instance_blockwise.hpp"
+#pragma once
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -27,7 +31,4 @@ ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_F64_F64_F64_HPP
+// SPDX-License-Identifier: MIT
-#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_F64_F64_F64_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "device_reduce_instance_blockwise.hpp"
+#pragma once
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -51,7 +55,4 @@ ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_I8_I32_I8_HPP
+// SPDX-License-Identifier: MIT
-#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_I8_I32_I8_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "device_reduce_instance_blockwise.hpp"
+#pragma once
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -23,7 +27,4 @@ ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_I8_I8_I8_HPP
+// SPDX-License-Identifier: MIT
-#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_I8_I8_I8_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "device_reduce_instance_blockwise.hpp"
+#pragma once
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -39,7 +43,4 @@ ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_IMPL_COMMON_HPP
+// SPDX-License-Identifier: MIT
-#define DEVICE_REDUCE_INSTANCE_IMPL_COMMON_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
 namespace ck {
 namespace tensor_operation {
@@ -35,7 +37,4 @@ struct ReductionConfiguration_2
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_HPP
+// SPDX-License-Identifier: MIT
-#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "reduction_operator_mapping.hpp"
+#pragma once
-#include "device_reduce_instance_impl_common.hpp"
-#include "device_reduce_multiblock.hpp"
+#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
+#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -61,12 +64,10 @@ using reduce_configuration_2_instances_multiblock_atomic_add = std::tuple<
    >;
 #endif
-template <typename AccDataType, ReduceTensorOp ReduceOperation>
+template <ReduceTensorOp ReduceOperation>
-using deviceReduceMultiBlockAtomicAddPtrType =
+using deviceReduceMultiBlockAtomicAddPtrType = DeviceReducePtr<
-    DeviceReducePtr<typename reduce_unary_operator<AccDataType, ReduceOperation, true, true>::
+    typename reduce_unary_operator<ReduceOperation, true, true>::InElementwiseOperation,
-                        InElementwiseOperation,
+    typename reduce_unary_operator<ReduceOperation, true, true>::AccElementwiseOperation>;
-                    typename reduce_unary_operator<AccDataType, ReduceOperation, true, true>::
-                        AccElementwiseOperation>;
 template <typename InDataType,
          typename AccDataType,
@@ -77,15 +78,13 @@ template <typename InDataType,
          bool PropagateNan,
          bool UseIndex>
 void add_device_reduce_instance_multiblock_atomic_add(
-    std::vector<deviceReduceMultiBlockAtomicAddPtrType<AccDataType, ReduceOpId>>&
+    std::vector<deviceReduceMultiBlockAtomicAddPtrType<ReduceOpId>>& device_op_instances)
-        device_op_instances)
 {
-    using ReduceOperation = typename reduce_binary_operator<AccDataType, ReduceOpId>::opType;
+    using ReduceOperation = typename reduce_binary_operator<ReduceOpId>::opType;
    using InElementwiseOperation =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation;
+        typename reduce_unary_operator<ReduceOpId, true, true>::InElementwiseOperation;
    using AccElementwiseOperation =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::
+        typename reduce_unary_operator<ReduceOpId, true, true>::AccElementwiseOperation;
-            AccElementwiseOperation;
    constexpr bool Indexable =
        (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
@@ -158,8 +157,7 @@ void add_device_reduce_instance_multiblock_atomic_add(
                                                                   ReduceOpId,   \
                                                                   PropagateNan, \
                                                                   UseIndex>(    \
-        std::vector<deviceReduceMultiBlockAtomicAddPtrType<compT, ReduceOpId>> & \
+        std::vector<deviceReduceMultiBlockAtomicAddPtrType<ReduceOpId>> & device_op_instances)
-        device_op_instances)
 #define ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(                                       \
    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)           \
@@ -172,21 +170,17 @@ void add_device_reduce_instance_multiblock_atomic_add(
                                           Rank,                                    \
                                           NumReduceDim)
-#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE(                                                \
+#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE(                                     \
-    inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim)                      \
+    inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim)           \
-    extern template void add_device_reduce_instance_multiblock_atomic_add<inT,                     \
+    extern template void add_device_reduce_instance_multiblock_atomic_add<inT,          \
-                                                                          compT,                   \
+                                                                          compT,        \
-                                                                          outT,                    \
+                                                                          outT,         \
-                                                                          Rank,                    \
+                                                                          Rank,         \
-                                                                          NumReduceDim,            \
+                                                                          NumReduceDim, \
-                                                                          ReduceOpId,              \
+                                                                          ReduceOpId,   \
-                                                                          PropagateNan,            \
+                                                                          PropagateNan, \
-                                                                          UseIndex>(               \
+                                                                          UseIndex>(    \
-        std::vector<DeviceReducePtr<                                                               \
+        std::vector<deviceReduceMultiBlockAtomicAddPtrType<ReduceOpId>> & device_op_instances)
-            typename reduce_unary_operator<compT, ReduceOpId, true, true>::InElementwiseOperation, \
-            typename reduce_unary_operator<compT, ReduceOpId, true, true>::                        \
-                AccElementwiseOperation>> &                                                        \
-        device_op_instances)
 #define ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(                                       \
    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)               \
@@ -202,7 +196,4 @@ void add_device_reduce_instance_multiblock_atomic_add(
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_B16_F32_F32_HPP
+// SPDX-License-Identifier: MIT
-#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_B16_F32_F32_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "data_type.hpp"
+#pragma once
-#include "device_reduce_instance_multiblock_atomic_add.hpp"
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -24,7 +27,4 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(bhalf_t, float, float, 5, 0, 0, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F16_F32_F32_HPP
+// SPDX-License-Identifier: MIT
-#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F16_F32_F32_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "data_type.hpp"
+#pragma once
-#include "device_reduce_instance_multiblock_atomic_add.hpp"
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -24,7 +27,4 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 5, 0, 0, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F32_F32_HPP
+// SPDX-License-Identifier: MIT
-#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F32_F32_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "device_reduce_instance_multiblock_atomic_add.hpp"
+#pragma once
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -23,7 +27,4 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 5, 0, 0, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F64_F32_HPP
+// SPDX-License-Identifier: MIT
-#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F64_F32_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "device_reduce_instance_multiblock_atomic_add.hpp"
+#pragma once
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -23,7 +27,4 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 5, 0, 0, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F64_F64_F64_HPP
+// SPDX-License-Identifier: MIT
-#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F64_F64_F64_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "device_reduce_instance_multiblock_atomic_add.hpp"
+#pragma once
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -23,7 +27,4 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_HPP
+// SPDX-License-Identifier: MIT
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "reduction_operator_mapping.hpp"
+#pragma once
-#include "device_reduce_instance_impl_common.hpp"
-#include "device_reduce_threadwise.hpp"
+#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
+#include "ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -47,10 +49,10 @@ using reduce_configuration_2_instances_threadwise = std::tuple<
    >;
 #endif
-template <typename AccDataType, ReduceTensorOp ReduceOpId>
+template <ReduceTensorOp ReduceOpId>
 using deviceReduceThreadWisePtrType = DeviceReducePtr<
-    typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation,
+    typename reduce_unary_operator<ReduceOpId, true, true>::InElementwiseOperation,
-    typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::AccElementwiseOperation>;
+    typename reduce_unary_operator<ReduceOpId, true, true>::AccElementwiseOperation>;
 template <typename InDataType,
          typename AccDataType,
@@ -61,14 +63,13 @@ template <typename InDataType,
          bool PropagateNan,
          bool UseIndex>
 void add_device_reduce_instance_threadwise(
-    std::vector<deviceReduceThreadWisePtrType<AccDataType, ReduceOpId>>& device_op_instances)
+    std::vector<deviceReduceThreadWisePtrType<ReduceOpId>>& device_op_instances)
 {
-    using ReduceOperation = typename reduce_binary_operator<AccDataType, ReduceOpId>::opType;
+    using ReduceOperation = typename reduce_binary_operator<ReduceOpId>::opType;
    using InElementwiseOperation =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation;
+        typename reduce_unary_operator<ReduceOpId, true, true>::InElementwiseOperation;
    using AccElementwiseOperation =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::
+        typename reduce_unary_operator<ReduceOpId, true, true>::AccElementwiseOperation;
-            AccElementwiseOperation;
    constexpr bool Indexable =
        (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
@@ -114,7 +115,7 @@ void add_device_reduce_instance_threadwise(
                                                        ReduceOpId,           \
                                                        PropagateNan,         \
                                                        UseIndex>(            \
-        std::vector<deviceReduceThreadWisePtrType<compT, ReduceOpId>> & device_op_instances)
+        std::vector<deviceReduceThreadWisePtrType<ReduceOpId>> & device_op_instances)
 #define ADD_THREADWISE_INST_BY_ID(                                        \
    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
@@ -127,21 +128,17 @@ void add_device_reduce_instance_threadwise(
                                Rank,                                     \
                                NumReduceDim)
-#define ADD_THREADWISE_INST_REF_BY_TYPE(                                                           \
+#define ADD_THREADWISE_INST_REF_BY_TYPE(                                      \
-    inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim)                      \
+    inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim) \
-    extern template void add_device_reduce_instance_threadwise<inT,                                \
+    extern template void add_device_reduce_instance_threadwise<inT,           \
-                                                               compT,                              \
+                                                               compT,         \
-                                                               outT,                               \
+                                                               outT,          \
-                                                               Rank,                               \
+                                                               Rank,          \
-                                                               NumReduceDim,                       \
+                                                               NumReduceDim,  \
-                                                               ReduceOpId,                         \
+                                                               ReduceOpId,    \
-                                                               PropagateNan,                       \
+                                                               PropagateNan,  \
-                                                               UseIndex>(                          \
+                                                               UseIndex>(     \
-        std::vector<DeviceReducePtr<                                                               \
+        std::vector<deviceReduceThreadWisePtrType<ReduceOpId>> & device_op_instances)
-            typename reduce_unary_operator<compT, ReduceOpId, true, true>::InElementwiseOperation, \
-            typename reduce_unary_operator<compT, ReduceOpId, true, true>::                        \
-                AccElementwiseOperation>> &                                                        \
-        device_op_instances)
 #define ADD_THREADWISE_INST_REF_BY_ID(                                       \
    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)    \
@@ -157,7 +154,4 @@ void add_device_reduce_instance_threadwise(
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_B16_F32_B16_HPP
+// SPDX-License-Identifier: MIT
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_B16_F32_B16_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "data_type.hpp"
+#pragma once
-#include "device_reduce_instance_threadwise.hpp"
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -53,7 +56,4 @@ ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp
-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F16_F16_HPP
+// SPDX-License-Identifier: MIT
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F16_F16_HPP
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include "data_type.hpp"
+#pragma once
-#include "device_reduce_instance_threadwise.hpp"
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -40,7 +43,4 @@ ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1);
 } // namespace device_reduce_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif