Commit b79df771 authored by carlushuang

Merge remote-tracking branch 'origin/develop' into cpu_avx2

parents 05d38218 63914743
-#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F32_F32_HPP
-#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F32_F32_HPP
-#include "device_reduce_instance_multiblock_atomic_add.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace device {
-namespace device_reduce_instance {
+namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
@@ -20,10 +24,7 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 1);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 5, 0, 0, 2, 1);
 // clang-format on
-} // namespace device_reduce_instance
+} // namespace instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif

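The registration macros above are positional; reading one of the lines kept by this hunk against the column header (the numeric ReduceOpId/NanPropaOpt/IndicesOpt values are opaque ids, exactly as they appear throughout these instance files):

// ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 5, 0, 0, 2, 1);
//   InDataType  = float       AccDataType  = float   OutDataType = float
//   ReduceOpId  = 5 (id)      NanPropaOpt  = 0       IndicesOpt  = 0
//   Rank        = 2           NumReduceDim = 1
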
-#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F64_F32_HPP
-#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F32_F64_F32_HPP
-#include "device_reduce_instance_multiblock_atomic_add.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace device {
-namespace device_reduce_instance {
+namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
@@ -20,10 +24,7 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 1);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 5, 0, 0, 2, 1);
 // clang-format on
-} // namespace device_reduce_instance
+} // namespace instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif

-#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F64_F64_F64_HPP
-#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_ATOMIC_ADD_F64_F64_F64_HPP
-#include "device_reduce_instance_multiblock_atomic_add.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace device {
-namespace device_reduce_instance {
+namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
@@ -20,10 +24,7 @@ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 1);
 ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1);
 // clang-format on
-} // namespace device_reduce_instance
+} // namespace instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif

-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_HPP
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_HPP
-#include "reduction_operator_mapping.hpp"
-#include "device_reduce_instance_impl_common.hpp"
-#include "device_reduce_threadwise.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
+#include "ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace device {
-namespace device_reduce_instance {
+namespace instance {
 #ifdef QUICK_REDUCE_TEST
 using reduce_configuration_2_instances_threadwise = std::tuple<
@@ -47,10 +49,10 @@ using reduce_configuration_2_instances_threadwise = std::tuple<
     >;
 #endif
-template <typename AccDataType, ReduceTensorOp ReduceOpId>
+template <ReduceTensorOp ReduceOpId>
 using deviceReduceThreadWisePtrType = DeviceReducePtr<
-    typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation,
-    typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::AccElementwiseOperation>;
+    typename reduce_unary_operator<ReduceOpId, true, true>::InElementwiseOperation,
+    typename reduce_unary_operator<ReduceOpId, true, true>::AccElementwiseOperation>;
 template <typename InDataType,
           typename AccDataType,
@@ -61,14 +63,13 @@ template <typename InDataType,
           bool PropagateNan,
           bool UseIndex>
 void add_device_reduce_instance_threadwise(
-    std::vector<deviceReduceThreadWisePtrType<AccDataType, ReduceOpId>>& device_op_instances)
+    std::vector<deviceReduceThreadWisePtrType<ReduceOpId>>& device_op_instances)
 {
-    using ReduceOperation = typename reduce_binary_operator<AccDataType, ReduceOpId>::opType;
+    using ReduceOperation = typename reduce_binary_operator<ReduceOpId>::opType;
     using InElementwiseOperation =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation;
+        typename reduce_unary_operator<ReduceOpId, true, true>::InElementwiseOperation;
     using AccElementwiseOperation =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::
-            AccElementwiseOperation;
+        typename reduce_unary_operator<ReduceOpId, true, true>::AccElementwiseOperation;
     constexpr bool Indexable =
         (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
@@ -114,7 +115,7 @@ void add_device_reduce_instance_threadwise(
                                             ReduceOpId,   \
                                             PropagateNan, \
                                             UseIndex>(    \
-        std::vector<deviceReduceThreadWisePtrType<compT, ReduceOpId>> & device_op_instances)
+        std::vector<deviceReduceThreadWisePtrType<ReduceOpId>> & device_op_instances)
 #define ADD_THREADWISE_INST_BY_ID(                                        \
     inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
@@ -127,21 +128,17 @@ void add_device_reduce_instance_threadwise(
                              Rank,         \
                              NumReduceDim)
 #define ADD_THREADWISE_INST_REF_BY_TYPE(                                      \
     inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim) \
     extern template void add_device_reduce_instance_threadwise<inT,           \
                                                                 compT,        \
                                                                 outT,         \
                                                                 Rank,         \
                                                                 NumReduceDim, \
                                                                 ReduceOpId,   \
                                                                 PropagateNan, \
                                                                 UseIndex>(    \
-        std::vector<DeviceReducePtr<                                                                \
-            typename reduce_unary_operator<compT, ReduceOpId, true, true>::InElementwiseOperation, \
-            typename reduce_unary_operator<compT, ReduceOpId, true, true>::                        \
-                AccElementwiseOperation>> &                                                         \
-            device_op_instances)
+        std::vector<deviceReduceThreadWisePtrType<ReduceOpId>> & device_op_instances)
 #define ADD_THREADWISE_INST_REF_BY_ID(                                        \
     inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)     \
@@ -154,10 +151,7 @@ void add_device_reduce_instance_threadwise(
                              Rank,         \
                              NumReduceDim)
-} // namespace device_reduce_instance
+} // namespace instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif

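Since the AccDataType template parameter has been dropped from deviceReduceThreadWisePtrType and reduce_unary_operator, callers now key the pointer type only on the reduce operation. A minimal caller-side sketch under stated assumptions: a matching explicit instantiation for float / rank 4 / one reduce dim / MIN has been compiled into the instance library, and ReduceTensorOp is reachable as ck::ReduceTensorOp (its MIN enumerator is the one used in the Indexable check above).

#include <vector>
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"

namespace inst = ck::tensor_operation::device::instance;

inline auto get_threadwise_min_instances()
{
    // The pointer type is now parameterised by the reduce op only.
    std::vector<inst::deviceReduceThreadWisePtrType<ck::ReduceTensorOp::MIN>> instances;

    // Template arguments mirror the macro parameters:
    // InDataType, AccDataType, OutDataType, Rank, NumReduceDim, ReduceOpId, PropagateNan, UseIndex
    inst::add_device_reduce_instance_threadwise<float, float, float, 4, 1,
                                                ck::ReduceTensorOp::MIN, false, false>(instances);
    return instances;
}
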
-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_B16_F32_B16_HPP
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_B16_F32_B16_HPP
-#include "data_type.hpp"
-#include "device_reduce_instance_threadwise.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace device {
-namespace device_reduce_instance {
+namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
@@ -50,10 +53,7 @@ ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1);
 // clang-format on
-} // namespace device_reduce_instance
+} // namespace instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif

-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F16_F16_HPP
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F16_F16_HPP
-#include "data_type.hpp"
-#include "device_reduce_instance_threadwise.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace device {
-namespace device_reduce_instance {
+namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
@@ -37,10 +40,7 @@ ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1);
 // clang-format on
-} // namespace device_reduce_instance
+} // namespace instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif

-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F32_F16_HPP
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F32_F16_HPP
-#include "data_type.hpp"
-#include "device_reduce_instance_threadwise.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace device {
-namespace device_reduce_instance {
+namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
@@ -25,10 +28,7 @@ ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1);
 // clang-format on
-} // namespace device_reduce_instance
+} // namespace instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif

-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F32_F32_HPP
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F32_F32_HPP
-#include "device_reduce_instance_threadwise.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace device {
-namespace device_reduce_instance {
+namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
@@ -48,10 +52,7 @@ ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1);
 // clang-format on
-} // namespace device_reduce_instance
+} // namespace instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif

-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F64_F32_HPP
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F64_F32_HPP
-#include "device_reduce_instance_threadwise.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace device {
-namespace device_reduce_instance {
+namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
@@ -24,10 +28,7 @@ ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 2, 1);
 // clang-format on
-} // namespace device_reduce_instance
+} // namespace instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif

-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F64_F64_F64_HPP
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_F64_F64_F64_HPP
-#include "device_reduce_instance_threadwise.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace device {
-namespace device_reduce_instance {
+namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
@@ -48,10 +52,7 @@ ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 2, 1);
 // clang-format on
-} // namespace device_reduce_instance
+} // namespace instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif

-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I32_I8_HPP
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I32_I8_HPP
-#include "device_reduce_instance_threadwise.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace device {
-namespace device_reduce_instance {
+namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
@@ -20,10 +24,7 @@ ADD_THREADWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 1);
 // clang-format on
-} // namespace device_reduce_instance
+} // namespace instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif

-#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I8_I8_HPP
-#define DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I8_I8_HPP
-#include "device_reduce_instance_threadwise.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include "ck/utility/data_type.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace device {
-namespace device_reduce_instance {
+namespace instance {
 // clang-format off
 // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
@@ -36,10 +40,7 @@ ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1);
 ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1);
 // clang-format on
-} // namespace device_reduce_instance
+} // namespace instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif

-#ifndef CHECK_ERR_HPP
-#define CHECK_ERR_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once

 #include <algorithm>
 #include <cmath>
 #include <cstdlib>
-#include <half.hpp>
 #include <iostream>
 #include <iomanip>
 #include <iterator>
 #include <limits>
 #include <type_traits>
 #include <vector>

-#include "data_type.hpp"
+#include "ck/utility/data_type.hpp"

 namespace ck {
 namespace utils {

 template <typename T>
 typename std::enable_if<std::is_floating_point<T>::value && !std::is_same<T, half_t>::value,
                         bool>::type
 check_err(const std::vector<T>& out,
           const std::vector<T>& ref,
           const std::string& msg = "Error: Incorrect results!",
           double rtol = 1e-5,
           double atol = 3e-6)
 {
     if(out.size() != ref.size())
     {
         std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
                   << std::endl
                   << msg << std::endl;
         return false;
     }

     bool res{true};
     int err_count = 0;
     double err = 0;
     double max_err = std::numeric_limits<double>::min();
     for(std::size_t i = 0; i < ref.size(); ++i)
     {
         err = std::abs(out[i] - ref[i]);
         if(err > atol + rtol * std::abs(ref[i]) || !std::isfinite(out[i]) || !std::isfinite(ref[i]))
         {
             max_err = err > max_err ? err : max_err;
             err_count++;
             if(err_count < 5)
             {
                 std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref["
                           << i << "]: " << out[i] << " != " << ref[i] << std::endl
                           << msg << std::endl;
             }
             res = false;
         }
     }
     if(!res)
     {
         std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
     }
     return res;
 }

 template <typename T>
 typename std::enable_if<std::is_same<T, bhalf_t>::value, bool>::type
 check_err(const std::vector<T>& out,
           const std::vector<T>& ref,
           const std::string& msg = "Error: Incorrect results!",
           double rtol = 1e-3,
           double atol = 1e-3)
 {
     if(out.size() != ref.size())
     {
         std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
                   << std::endl
                   << msg << std::endl;
         return false;
     }

     bool res{true};
     int err_count = 0;
     double err = 0;
     // TODO: This is a hack. We should have proper specialization for bhalf_t data type.
     double max_err = std::numeric_limits<float>::min();
     for(std::size_t i = 0; i < ref.size(); ++i)
     {
         double o = type_convert<float>(out[i]);
         double r = type_convert<float>(ref[i]);
         err      = std::abs(o - r);
         if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
         {
             max_err = err > max_err ? err : max_err;
             err_count++;
             if(err_count < 5)
             {
                 std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref["
                           << i << "]: " << o << " != " << r << std::endl
                           << msg << std::endl;
             }
             res = false;
         }
     }
     if(!res)
     {
         std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
     }
     return res;
 }

 template <typename T>
-typename std::enable_if<std::is_same<T, half_t>::value || std::is_same<T, half_float::half>::value,
-                        bool>::type
+typename std::enable_if<std::is_same<T, half_t>::value, bool>::type
 check_err(const std::vector<T>& out,
           const std::vector<T>& ref,
           const std::string& msg = "Error: Incorrect results!",
           double rtol = 1e-3,
           double atol = 1e-3)
 {
     if(out.size() != ref.size())
     {
         std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
                   << std::endl
                   << msg << std::endl;
         return false;
     }

     bool res{true};
     int err_count = 0;
     double err = 0;
     double max_err = std::numeric_limits<T>::min();
     for(std::size_t i = 0; i < ref.size(); ++i)
     {
         double o = type_convert<float>(out[i]);
         double r = type_convert<float>(ref[i]);
         err      = std::abs(o - r);
         if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
         {
             max_err = err > max_err ? err : max_err;
             err_count++;
             if(err_count < 5)
             {
                 std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref["
                           << i << "]: " << o << " != " << r << std::endl
                           << msg << std::endl;
             }
             res = false;
         }
     }
     if(!res)
     {
         std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
     }
     return res;
 }

 template <typename T>
 typename std::enable_if<std::is_integral<T>::value && !std::is_same<T, bhalf_t>::value, bool>::type
 check_err(const std::vector<T>& out,
           const std::vector<T>& ref,
           const std::string& msg = "Error: Incorrect results!",
           double = 0,
-          double = 0)
+          double atol = 0)
 {
     if(out.size() != ref.size())
     {
         std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
                   << std::endl
                   << msg << std::endl;
         return false;
     }

-    for(std::size_t i = 0; i < ref.size(); ++i)
-    {
-        if(out[i] != ref[i])
-        {
-            std::cout << "out[" << i << "] != ref[" << i << "]: " << static_cast<int>(out[i])
-                      << " != " << static_cast<int>(ref[i]) << std::endl
-                      << msg << std::endl;
-            return false;
-        }
-    }
-    return true;
+    bool res{true};
+    int err_count   = 0;
+    int64_t err     = 0;
+    int64_t max_err = std::numeric_limits<int64_t>::min();
+    for(std::size_t i = 0; i < ref.size(); ++i)
+    {
+        int64_t o = out[i];
+        int64_t r = ref[i];
+        err       = std::abs(o - r);
+
+        if(err > atol)
+        {
+            max_err = err > max_err ? err : max_err;
+            err_count++;
+            if(err_count < 5)
+            {
+                std::cout << "out[" << i << "] != ref[" << i << "]: " << static_cast<int>(out[i])
+                          << " != " << static_cast<int>(ref[i]) << std::endl
+                          << msg << std::endl;
+            }
+            res = false;
+        }
+    }
+    if(!res)
+    {
+        std::cout << "max err: " << max_err << std::endl;
+    }
+    return res;
 }

 } // namespace utils
 } // namespace ck

 template <typename T>
 std::ostream& operator<<(std::ostream& os, const std::vector<T>& v)
 {
     std::copy(std::begin(v), std::end(v), std::ostream_iterator<T>(os, " "));
     return os;
 }
-
-#endif

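A small usage sketch of the comparison helper defined above (the include path is the one this commit introduces elsewhere in the library; the tolerances shown are simply the overload's own defaults):

#include <vector>
#include "ck/library/utility/check_err.hpp"

bool verify(const std::vector<float>& gpu_out, const std::vector<float>& cpu_ref)
{
    // Floating-point overload: every element must satisfy |out - ref| <= atol + rtol * |ref|
    // and both values must be finite.
    return ck::utils::check_err(gpu_out, cpu_ref, "Error: Incorrect results!", 1e-5, 3e-6);
}
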
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once

 #include <cstdlib>
@@ -9,17 +12,17 @@
 #include <type_traits>
 #include <vector>

-#include "check_err.hpp"
-#include "config.hpp"
-#include "device.hpp"
-#include "device_conv_fwd.hpp"
-#include "device_tensor.hpp"
-#include "element_wise_operation.hpp"
-#include "fill.hpp"
-#include "host_tensor.hpp"
-#include "op_instance_engine.hpp"
-#include "reference_conv_fwd.hpp"
-#include "tensor_layout.hpp"
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/fill.hpp"
+#include "ck/library/utility/op_instance_engine.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"

 namespace ck {
 namespace tensor_operation {
@@ -28,15 +31,15 @@ namespace device {
 using DeviceConvFwdNoOpPtr = DeviceConvFwdPtr<element_wise::PassThrough,
                                               element_wise::PassThrough,
                                               element_wise::PassThrough>;
-namespace device_conv1d_fwd_instance {
+namespace instance {
 void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_bf16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
 void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
 void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f32_instances(std::vector<DeviceConvFwdNoOpPtr>&);
 void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_int8_instances(std::vector<DeviceConvFwdNoOpPtr>&);
-} // namespace device_conv1d_fwd_instance
+} // namespace instance

-namespace device_conv2d_fwd_instance {
+namespace instance {
 void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
 void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
@@ -45,15 +48,15 @@ void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(
 void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(std::vector<DeviceConvFwdNoOpPtr>&);
 void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(std::vector<DeviceConvFwdNoOpPtr>&);
-} // namespace device_conv2d_fwd_instance
+} // namespace instance

-namespace device_conv3d_fwd_instance {
+namespace instance {
 void add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_bf16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
 void add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
 void add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f32_instances(std::vector<DeviceConvFwdNoOpPtr>&);
 void add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_int8_instances(std::vector<DeviceConvFwdNoOpPtr>&);
-} // namespace device_conv3d_fwd_instance
+} // namespace instance

 } // namespace device
 } // namespace tensor_operation
@@ -292,17 +295,17 @@ struct ConvolutionFwdInstances<float, float, float>
         std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
         if constexpr(NumDimSpatial == 1)
         {
-            ck::tensor_operation::device::device_conv1d_fwd_instance::
+            ck::tensor_operation::device::instance::
                 add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f32_instances(conv_ptrs);
         }
         else if constexpr(NumDimSpatial == 2)
         {
-            ck::tensor_operation::device::device_conv2d_fwd_instance::
+            ck::tensor_operation::device::instance::
                 add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(conv_ptrs);
         }
         else if constexpr(NumDimSpatial == 3)
         {
-            ck::tensor_operation::device::device_conv3d_fwd_instance::
+            ck::tensor_operation::device::instance::
                 add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f32_instances(conv_ptrs);
         }
         return conv_ptrs;
@@ -319,20 +322,20 @@ struct ConvolutionFwdInstances<half_t, half_t, half_t>
         std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
         if constexpr(NumDimSpatial == 1)
         {
-            ck::tensor_operation::device::device_conv1d_fwd_instance::
+            ck::tensor_operation::device::instance::
                 add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f16_instances(conv_ptrs);
             return conv_ptrs;
         }
         else if constexpr(NumDimSpatial == 2)
         {
-            ck::tensor_operation::device::device_conv2d_fwd_instance::
+            ck::tensor_operation::device::instance::
                 add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
-            ck::tensor_operation::device::device_conv2d_fwd_instance::
+            ck::tensor_operation::device::instance::
                 add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
         }
         else if constexpr(NumDimSpatial == 3)
         {
-            ck::tensor_operation::device::device_conv3d_fwd_instance::
+            ck::tensor_operation::device::instance::
                 add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f16_instances(conv_ptrs);
         }
         return conv_ptrs;
@@ -349,17 +352,17 @@ struct ConvolutionFwdInstances<bhalf_t, bhalf_t, bhalf_t>
         std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
         if constexpr(NumDimSpatial == 1)
         {
-            ck::tensor_operation::device::device_conv1d_fwd_instance::
+            ck::tensor_operation::device::instance::
                 add_device_conv1d_fwd_xdl_nwc_kxc_nwk_bf16_instances(conv_ptrs);
         }
         else if constexpr(NumDimSpatial == 2)
         {
-            ck::tensor_operation::device::device_conv2d_fwd_instance::
+            ck::tensor_operation::device::instance::
                 add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(conv_ptrs);
         }
         else if constexpr(NumDimSpatial == 3)
         {
-            ck::tensor_operation::device::device_conv3d_fwd_instance::
+            ck::tensor_operation::device::instance::
                 add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_bf16_instances(conv_ptrs);
         }
         return conv_ptrs;
@@ -376,17 +379,17 @@ struct ConvolutionFwdInstances<int8_t, int8_t, int8_t>
         std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
         if constexpr(NumDimSpatial == 1)
         {
-            ck::tensor_operation::device::device_conv1d_fwd_instance::
+            ck::tensor_operation::device::instance::
                 add_device_conv1d_fwd_xdl_nwc_kxc_nwk_int8_instances(conv_ptrs);
         }
         else if constexpr(NumDimSpatial == 2)
         {
-            ck::tensor_operation::device::device_conv2d_fwd_instance::
+            ck::tensor_operation::device::instance::
                 add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(conv_ptrs);
         }
         else if constexpr(NumDimSpatial == 3)
         {
-            ck::tensor_operation::device::device_conv3d_fwd_instance::
+            ck::tensor_operation::device::instance::
                 add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_int8_instances(conv_ptrs);
         }
         return conv_ptrs;
@@ -402,8 +405,8 @@ template <typename InDataType,
           typename InElementwiseOp  = ck::tensor_operation::element_wise::PassThrough,
           typename WeiElementwiseOp = ck::tensor_operation::element_wise::PassThrough,
           typename OutElementwiseOp = ck::tensor_operation::element_wise::PassThrough,
-          typename InputInitFun     = FillUniform<InDataType>,
-          typename WeightsInitFun   = FillUniform<WeiDataType>>
+          typename InputInitFun     = FillUniformDistribution<InDataType>,
+          typename WeightsInitFun   = FillUniformDistribution<WeiDataType>>
 class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType, WeiDataType>
 {
     using DeviceConvFwdOp = tensor_operation::device::
@@ -422,8 +425,8 @@ class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType,
     ConvFwdOpInstance(const ConvParams& params,
                       bool do_init = true,
-                      const InputInitFun& input_init_f     = InputInitFun{},
-                      const WeightsInitFun& weights_init_f = WeightsInitFun{})
+                      const InputInitFun& input_init_f     = InputInitFun(),
+                      const WeightsInitFun& weights_init_f = WeightsInitFun())
         : BaseType(),
           params_{params},
          output_spatial_lengths_{params.GetOutputSpatialLengths()},
@@ -560,8 +563,8 @@ class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType,
     const ConvParams& params_;
     const std::vector<ck::index_t> output_spatial_lengths_;
     const bool do_init_;
-    const InputInitFun& input_init_f_;
-    const WeightsInitFun& weights_init_f_;
+    InputInitFun input_init_f_;
+    WeightsInitFun weights_init_f_;
 };

 } // namespace conv
...

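A hedged consumer-side sketch of the renamed instance factories; only declarations that actually appear in this header are used, and it assumes the header itself is already on the include path:

#include <vector>

std::vector<ck::tensor_operation::device::DeviceConvFwdNoOpPtr> get_conv2d_fwd_f32_instances()
{
    std::vector<ck::tensor_operation::device::DeviceConvFwdNoOpPtr> conv_ptrs;
    ck::tensor_operation::device::instance::
        add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(conv_ptrs);
    return conv_ptrs;
}
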
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once

 #include <algorithm>
+#include <cmath>
 #include <random>

-#include "data_type.hpp"
+#include "ck/utility/data_type.hpp"

 namespace ck {
 namespace utils {

-// template <typename T, class Enable = void>
-// struct FillUniform;
-
-// TODO: what's wrong with this specialization???
-// err: segmentation fault in mt19937 - infinite loop like.
-// template <typename T>
-// struct FillUniform<T, typename std::enable_if<std::is_integral<T>::value &&
-//                                               !std::is_same<T, bhalf_t>::value>::type>
-// {
-//     int a_{0};
-//     int b_{5};
-//     // T a_ = T{0};
-//     // T b_ = T{5};
-
-//     template <typename ForwardIter>
-//     void operator()(ForwardIter first, ForwardIter last) const
-//     {
-//         std::mt19937 gen{11939};
-//         std::uniform_int_distribution<int> dis(a_, b_);
-//         std::generate(first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); });
-//     }
-// };
-
-// struct FillUniform<T, typename std::enable_if<std::is_floating_point<T>::value ||
-//                                               std::is_same<T, bhalf_t>::value>::type>
 template <typename T>
-struct FillUniform
+struct FillUniformDistribution
 {
-    float a_{0};
-    float b_{5};
+    float a_{-5.f};
+    float b_{5.f};

     template <typename ForwardIter>
     void operator()(ForwardIter first, ForwardIter last) const
     {
-        std::mt19937 gen{11939};
-        std::uniform_real_distribution<> dis(a_, b_);
+        std::mt19937 gen(11939);
+        std::uniform_real_distribution<float> dis(a_, b_);
         std::generate(first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); });
     }
 };

+// Normally FillUniformDistributionIntegerValue should use std::uniform_int_distribution as below.
+// However this produces segfaults in std::mt19937 which look like an infinite loop.
+// template <typename T>
+// struct FillUniformDistributionIntegerValue
+// {
+//     int a_{-5};
+//     int b_{5};
+//
+//     template <typename ForwardIter>
+//     void operator()(ForwardIter first, ForwardIter last) const
+//     {
+//         std::mt19937 gen(11939);
+//         std::uniform_int_distribution<int> dis(a_, b_);
+//         std::generate(
+//             first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); });
+//     }
+// };
+
+// Workaround for uniform_int_distribution not working as expected. See note above.
+template <typename T>
+struct FillUniformDistributionIntegerValue
+{
+    float a_{-5.f};
+    float b_{5.f};
+
+    template <typename ForwardIter>
+    void operator()(ForwardIter first, ForwardIter last) const
+    {
+        std::mt19937 gen(11939);
+        std::uniform_real_distribution<float> dis(a_, b_);
+        std::generate(
+            first, last, [&dis, &gen]() { return ck::type_convert<T>(std::round(dis(gen))); });
+    }
+};
...

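A short usage sketch for the fillers above. Using a std::vector here is an assumption; any forward-iterator range works, and FillUniformDistributionIntegerValue simply rounds the same real distribution to integer-valued entries:

#include <vector>
#include "ck/library/utility/fill.hpp"

int main()
{
    std::vector<float> input(1024);
    // Aggregate-init sets a_ and b_, i.e. uniform values in [-1, 1].
    ck::utils::FillUniformDistribution<float>{-1.f, 1.f}(input.begin(), input.end());

    std::vector<float> weights(256);
    // Integer-valued floats (rounded), handy for exact reference checks.
    ck::utils::FillUniformDistributionIntegerValue<float>{}(weights.begin(), weights.end());
}
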
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once

 #include <cstdlib>
+#include <iostream>
 #include <limits>
 #include <memory>
 #include <stdexcept>
@@ -8,9 +12,12 @@
 #include <utility>
 #include <vector>

-#include "check_err.hpp"
-#include "device_base.hpp"
-#include "functional2.hpp"
+#include "ck/utility/functional2.hpp"
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/host_tensor/device_memory.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"

 namespace ck {
 namespace utils {
@@ -78,7 +85,8 @@ class OpInstanceRunEngine
     template <typename ReferenceOp = std::function<void()>>
     OpInstanceRunEngine(const OpInstanceT& op_instance,
-                        const ReferenceOp& reference_op = ReferenceOp{})
+                        const ReferenceOp& reference_op = ReferenceOp{},
+                        bool do_verification            = true)
         : op_instance_{op_instance}
     {
         in_tensors_ = op_instance_.GetInputTensors();
@@ -88,8 +96,11 @@ class OpInstanceRunEngine
                                              const Tensor<InArgTypes>&...,
                                              Tensor<OutDataType>&>)
         {
-            ref_output_ = op_instance_.GetOutputTensor();
-            CallRefOpUnpackArgs(reference_op, std::make_index_sequence<kNInArgs_>{});
+            if(do_verification)
+            {
+                ref_output_ = op_instance_.GetOutputTensor();
+                CallRefOpUnpackArgs(reference_op, std::make_index_sequence<kNInArgs_>{});
+            }
         }
         AllocateDeviceInputTensors(std::make_index_sequence<kNInArgs_>{});
         out_device_buffer_ =
@@ -110,6 +121,7 @@ class OpInstanceRunEngine
                 op_ptr.get(), in_device_buffers_, out_device_buffer_);
             if(op_ptr->IsSupportedArgument(argument.get()))
             {
+                std::cout << "Testing instance: " << op_ptr->GetTypeString() << std::endl;
                 invoker->Run(argument.get());
                 out_device_buffer_->FromDevice(out_tensor_->mData.data());
                 if(!ref_output_)
@@ -119,9 +131,16 @@ class OpInstanceRunEngine
                                              " You have to provide reference function.");
                 }
                 // TODO: enable flexible use of custom check_error functions
-                res = res && check_err(out_tensor_->mData, ref_output_->mData);
+                bool inst_res = CheckErr(out_tensor_->mData, ref_output_->mData);
+                std::cout << (inst_res ? "SUCCESS" : "FAILURE") << std::endl;
+                res = res && inst_res;
                 out_device_buffer_->SetZero();
             }
+            else
+            {
+                std::cout << "Given conv problem is not supported by instance: \n\t>>>>"
+                          << op_ptr->GetTypeString() << std::endl;
+            }
         }
         return res;
     }
@@ -132,7 +151,6 @@ class OpInstanceRunEngine
                  bool do_verification = false,
                  bool do_log          = false)
     {
-        bool res{true};
         ProfileBestConfig best_config;

         for(auto& op_ptr : op_ptrs)
@@ -153,7 +171,7 @@ class OpInstanceRunEngine
             std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec
                       << " GB/s, " << op_name << std::endl;

-            if(tflops < best_config.best_tflops)
+            if(avg_time < best_config.best_avg_time)
             {
                 best_config.best_op_name = op_name;
                 best_config.best_tflops  = tflops;
@@ -171,7 +189,7 @@ class OpInstanceRunEngine
                                              " You have to provide reference function.");
                 }
                 // TODO: enable flexible use of custom check_error functions
-                res = res && CheckErr(out_tensor_->mData, ref_output_->mData);
+                CheckErr(out_tensor_->mData, ref_output_->mData);
                 if(do_log) {}
             }
@@ -223,7 +241,7 @@ class OpInstanceRunEngine
     template <typename T>
     bool CheckErr(const std::vector<T>& dev_out, const std::vector<T>& ref_out) const
     {
-        return ck::utils::check_err(dev_out, ref_out, "Error: incorrect results!", atol_, rtol_);
+        return ck::utils::check_err(dev_out, ref_out, "Error: incorrect results!", rtol_, atol_);
     }
 };
...

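The last hunk fixes an argument-order bug: check_err takes (out, ref, msg, rtol, atol), so passing atol_ before rtol_ silently used the absolute tolerance as the relative one and vice versa. A worked illustration with made-up numbers (the engine's real rtol_/atol_ defaults are not shown in this diff):

// check_err accepts an element when |out - ref| <= atol + rtol * |ref|.
constexpr double rtol = 1e-3, atol = 1e-5, ref = 100.0;
constexpr double intended_bound = atol + rtol * ref; // 0.10001
constexpr double swapped_bound  = rtol + atol * ref; // 0.002 -- what the old argument order produced
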
 ## host_tensor
-include_directories(BEFORE
-    ${PROJECT_SOURCE_DIR}/include/ck
-    ${PROJECT_SOURCE_DIR}/include/ck/utility
-    ${PROJECT_SOURCE_DIR}/library/include/ck/library/host_tensor
-)
 set(HOST_TENSOR_SOURCE
-    device.cpp
+    device_memory.cpp
     host_tensor.cpp
 )
@@ -17,22 +11,20 @@ target_compile_features(host_tensor PUBLIC)
 set_target_properties(host_tensor PROPERTIES POSITION_INDEPENDENT_CODE ON)
 target_include_directories(host_tensor SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
 target_include_directories(host_tensor PUBLIC
     "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck>"
     "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/utility>"
     "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/host_tensor>"
 )
-install(TARGETS host_tensor
-    EXPORT host_tensorTargets
-    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
-    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
-    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
-    INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
-)
+rocm_install(
+    TARGETS host_tensor
+    EXPORT host_tensorTargets
+)
-install(EXPORT host_tensorTargets
+rocm_install(
+    EXPORT host_tensorTargets
     FILE composable_kernelhost_tensorTargets.cmake
     NAMESPACE composable_kernel::
     DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
 )
...

#include <chrono>
#include <assert.h>
#include <string.h>
#include <stdlib.h>
#include "device.hpp"
#ifndef CK_NOGPU
DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
{
hip_check_error(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
}
void* DeviceMem::GetDeviceBuffer() { return mpDeviceBuf; }
std::size_t DeviceMem::GetBufferSize() { return mMemSize; }
void DeviceMem::ToDevice(const void* p)
{
hip_check_error(hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
}
void DeviceMem::FromDevice(void* p)
{
hip_check_error(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
}
void DeviceMem::SetZero() { hip_check_error(hipMemset(mpDeviceBuf, 0, mMemSize)); }
DeviceMem::~DeviceMem() { hip_check_error(hipFree(mpDeviceBuf)); }
struct KernelTimerImpl
{
KernelTimerImpl()
{
hip_check_error(hipEventCreate(&mStart));
hip_check_error(hipEventCreate(&mEnd));
}
~KernelTimerImpl()
{
hip_check_error(hipEventDestroy(mStart));
hip_check_error(hipEventDestroy(mEnd));
}
void Start()
{
hip_check_error(hipDeviceSynchronize());
hip_check_error(hipEventRecord(mStart, nullptr));
}
void End()
{
hip_check_error(hipEventRecord(mEnd, nullptr));
hip_check_error(hipEventSynchronize(mEnd));
}
float GetElapsedTime() const
{
float time;
hip_check_error(hipEventElapsedTime(&time, mStart, mEnd));
return time;
}
hipEvent_t mStart, mEnd;
};
KernelTimer::KernelTimer() : impl(new KernelTimerImpl()) {}
KernelTimer::~KernelTimer() {}
void KernelTimer::Start() { impl->Start(); }
void KernelTimer::End() { impl->End(); }
float KernelTimer::GetElapsedTime() const { return impl->GetElapsedTime(); }
#endif
DeviceAlignedMemCPU::DeviceAlignedMemCPU(std::size_t mem_size, std::size_t alignment)
: mMemSize(mem_size), mAlignment(alignment)
{
if(mem_size == 0)
{
mpDeviceBuf = nullptr;
}
else
{
assert(!(alignment == 0 || (alignment & (alignment - 1)))); // check pow of 2
// TODO: posix only
int rtn = posix_memalign(&mpDeviceBuf, alignment, mem_size);
assert(rtn == 0);
}
}
void* DeviceAlignedMemCPU::GetDeviceBuffer() { return mpDeviceBuf; }
std::size_t DeviceAlignedMemCPU::GetBufferSize() { return mMemSize; }
void DeviceAlignedMemCPU::ToDevice(const void* p) { memcpy(mpDeviceBuf, p, mMemSize); }
void DeviceAlignedMemCPU::FromDevice(void* p) { memcpy(p, mpDeviceBuf, mMemSize); }
void DeviceAlignedMemCPU::SetZero() { memset(mpDeviceBuf, 0, mMemSize); }
DeviceAlignedMemCPU::~DeviceAlignedMemCPU()
{
if(mpDeviceBuf != nullptr)
free(mpDeviceBuf);
}
struct WallTimerImpl
{
void Start() { mStart = std::chrono::high_resolution_clock::now(); }
void End() { mStop = std::chrono::high_resolution_clock::now(); }
float GetElapsedTime() const
{
return static_cast<float>(
std::chrono::duration_cast<std::chrono::microseconds>(mStop - mStart).count()) *
1e-3;
}
std::chrono::time_point<std::chrono::high_resolution_clock> mStart;
std::chrono::time_point<std::chrono::high_resolution_clock> mStop;
};
WallTimer::WallTimer() : impl(new WallTimerImpl()) {}
WallTimer::~WallTimer() {}
void WallTimer::Start() { impl->Start(); }
void WallTimer::End() { impl->End(); }
float WallTimer::GetElapsedTime() const { return impl->GetElapsedTime(); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/device_utility/hip_check_error.hpp"
#include "ck/library/host_tensor/device_memory.hpp"
DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
{
hip_check_error(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
}
void* DeviceMem::GetDeviceBuffer() { return mpDeviceBuf; }
std::size_t DeviceMem::GetBufferSize() { return mMemSize; }
void DeviceMem::ToDevice(const void* p)
{
hip_check_error(hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
}
void DeviceMem::FromDevice(void* p)
{
hip_check_error(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
}
void DeviceMem::SetZero() { hip_check_error(hipMemset(mpDeviceBuf, 0, mMemSize)); }
DeviceMem::~DeviceMem() { hip_check_error(hipFree(mpDeviceBuf)); }
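For reference, a minimal host-side round trip through the DeviceMem wrapper defined above, using only the member functions shown (the include path is the one this commit introduces):

#include <vector>
#include "ck/library/host_tensor/device_memory.hpp"

int main()
{
    std::vector<float> host(256, 1.0f);

    DeviceMem buf(host.size() * sizeof(float)); // hipMalloc in the constructor
    buf.ToDevice(host.data());                  // hipMemcpyHostToDevice
    // ... launch a kernel that reads/writes buf.GetDeviceBuffer() ...
    buf.FromDevice(host.data());                // hipMemcpyDeviceToHost
    return 0;
} // ~DeviceMem() calls hipFree
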
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #include <cassert>

-#include "host_tensor.hpp"
+#include "ck/library/host_tensor/host_tensor.hpp"

 void HostTensorDescriptor::CalculateStrides()
 {
@@ -50,25 +54,3 @@ std::ostream& operator<<(std::ostream& os, const HostTensorDescriptor& desc)
     return os;
 }
-
-void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream& os)
-{
-    os << "dim " << desc.GetNumOfDimension() << ", ";
-    os << "lengths {";
-    LogRange(os, desc.GetLengths(), ", ");
-    os << "}, ";
-    os << "strides {";
-    LogRange(os, desc.GetStrides(), ", ");
-    os << "}" << std::endl;
-}
-
-#if 1
-// FIXME: remove
-void bf16_to_f32_(const Tensor<ck::bhalf_t>& src, Tensor<float>& dst)
-{
-    for(std::size_t i = 0; i < src.mData.size(); ++i)
-        dst.mData[i] = ck::type_convert<float>(src.mData[i]);
-}
-#endif