Unverified commit 963e4a71 authored by rocking5566, committed by GitHub

Merge branch 'develop' into conv_quant_int8

parents ad29b25b 0ee3aea1
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, UnaryAbs, PassThrough>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, UnaryAbs, PassThrough>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, UnaryAbs, PassThrough>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, UnaryAbs, PassThrough>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 3, UnaryAbs, PassThrough>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 4, UnaryAbs, PassThrough>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 1, UnaryAbs, PassThrough>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<2, 1, UnaryAbs, PassThrough>>&);
// clang-format on
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
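
For context (not part of this commit): a client collects these pre-compiled kernels through the same add_device_reduce_instance_threadwise entry point that the explicit instantiations above populate. A minimal, hypothetical usage sketch, assuming only the names already used in this file (F64, ReduceAMax, UnaryAbs, PassThrough, DeviceReducePtr):

// Hypothetical usage sketch -- not part of the commit.
#include <vector>
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"

void collect_f64_amax_instances()
{
    using namespace ck::tensor_operation::device::instance;

    // Matches the <F64, F64, F64, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, false>
    // instantiation above: rank-4 input, 3 reduced dimensions, no index output.
    std::vector<DeviceReducePtr<4, 3, UnaryAbs, PassThrough>> reduce_ptrs;
    add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3,
                                          ReduceAMax, UnaryAbs, PassThrough,
                                          false, false>(reduce_ptrs);
    // Each pointer can then be asked to build an argument/invoker pair for a
    // concrete problem size, as the profiler code later in this commit does.
}
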
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, UnaryDivide>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, UnaryDivide>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, UnaryDivide>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, UnaryDivide>>&);
// clang-format on
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
// clang-format on
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
// clang-format on
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 3, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<4, 3, UnarySquare, UnarySqrt>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 4, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<4, 4, UnarySquare, UnarySqrt>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 4, 1, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<4, 1, UnarySquare, UnarySqrt>>&);
template void add_device_reduce_instance_threadwise<F64, F64, F64, 2, 1, ReduceAdd, UnarySquare, UnarySqrt, false, false>(std::vector<DeviceReducePtr<2, 1, UnarySquare, UnarySqrt>>&);
// clang-format on
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace ck {
@@ -9,15 +10,11 @@ namespace device {
namespace instance {
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 3); // for ADD
ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 4);
ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 1);
ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 2, 1);
ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 3); // for AVG
ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 4);
ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 1);
ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 1);
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
template void add_device_reduce_instance_threadwise<I8, I32, I8, 4, 3, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<I8, I32, I8, 4, 4, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<I8, I32, I8, 4, 1, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<I8, I32, I8, 2, 1, ReduceAdd, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
// clang-format on
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
template void add_device_reduce_instance_threadwise<I8, I32, I8, 4, 3, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, UnaryDivide>>&);
template void add_device_reduce_instance_threadwise<I8, I32, I8, 4, 4, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, UnaryDivide>>&);
template void add_device_reduce_instance_threadwise<I8, I32, I8, 4, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, UnaryDivide>>&);
template void add_device_reduce_instance_threadwise<I8, I32, I8, 2, 1, ReduceAdd, PassThrough, UnaryDivide, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, UnaryDivide>>&);
// clang-format on
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 3); // for MIN
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 4);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 1);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 2, 1);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 3); // for MAX
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 4);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 1);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 2, 1);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 3); // for AMAX
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 4);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 1);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 2, 1);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 3); // for MIN
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 4);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 1);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 2, 1);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 3); // for MAX
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 4);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 1);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 2, 1);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 3); // for AMAX
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 4);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1);
ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1);
// clang-format on
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, UnaryAbs, PassThrough>>&);
template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, UnaryAbs, PassThrough>>&);
template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, UnaryAbs, PassThrough>>&);
template void add_device_reduce_instance_threadwise<I8, I8, I8, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, UnaryAbs, PassThrough>>&);
template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 3, UnaryAbs, PassThrough>>&);
template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 4, UnaryAbs, PassThrough>>&);
template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 1, UnaryAbs, PassThrough>>&);
template void add_device_reduce_instance_threadwise<I8, I8, I8, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<2, 1, UnaryAbs, PassThrough>>&);
// clang-format on
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 3, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 4, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 1, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<I8, I8, I8, 2, 1, ReduceMax, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 3, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 4, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 1, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<I8, I8, I8, 2, 1, ReduceMax, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
// clang-format on
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 3, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 4, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 1, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<I8, I8, I8, 2, 1, ReduceMin, PassThrough, PassThrough, false, false>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 3, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 3, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 4, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 4, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<I8, I8, I8, 4, 1, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<4, 1, PassThrough, PassThrough>>&);
template void add_device_reduce_instance_threadwise<I8, I8, I8, 2, 1, ReduceMin, PassThrough, PassThrough, false, true>(std::vector<DeviceReducePtr<2, 1, PassThrough, PassThrough>>&);
// clang-format on
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
@@ -18,57 +18,61 @@ namespace tensor_operation {
namespace device {
namespace instance {
template <int Rank, int NumReduceDim, int ReduceOpId, bool PropagateNan, bool UseIndex>
template <index_t Rank,
index_t NumReduceDim,
ReduceTensorOp ReduceOpId,
bool PropagateNan,
bool UseIndex>
struct ReduceDescription
{
static constexpr int Rank_ = Rank;
static constexpr int NumReduceDim_ = NumReduceDim;
static constexpr int ReduceOpId_ = ReduceOpId;
static constexpr int PropagateNan_ = PropagateNan;
static constexpr int UseIndex_ = UseIndex;
static constexpr index_t Rank_ = Rank;
static constexpr index_t NumReduceDim_ = NumReduceDim;
static constexpr ReduceTensorOp ReduceOpId_ = ReduceOpId;
static constexpr bool PropagateNan_ = PropagateNan;
static constexpr bool UseIndex_ = UseIndex;
};
using reduce_description_instances =
std::tuple<ReduceDescription<4, 3, 0, false, false>, // for ADD
ReduceDescription<4, 4, 0, false, false>,
ReduceDescription<4, 1, 0, false, false>,
ReduceDescription<2, 1, 0, false, false>,
ReduceDescription<4, 3, 5, false, false>, // for AVG
ReduceDescription<4, 4, 5, false, false>,
ReduceDescription<4, 1, 5, false, false>,
ReduceDescription<2, 1, 5, false, false>,
ReduceDescription<4, 3, 7, false, false>, // for NORM2
ReduceDescription<4, 4, 7, false, false>,
ReduceDescription<4, 1, 7, false, false>,
ReduceDescription<2, 1, 7, false, false>,
ReduceDescription<4, 3, 2, false, false>, // for MIN
ReduceDescription<4, 4, 2, false, false>,
ReduceDescription<4, 1, 2, false, false>,
ReduceDescription<2, 1, 2, false, false>,
ReduceDescription<4, 3, 3, false, false>, // for MAX
ReduceDescription<4, 4, 3, false, false>,
ReduceDescription<4, 1, 3, false, false>,
ReduceDescription<2, 1, 3, false, false>,
ReduceDescription<4, 3, 4, false, false>, // for AMAX
ReduceDescription<4, 4, 4, false, false>,
ReduceDescription<4, 1, 4, false, false>,
ReduceDescription<2, 1, 4, false, false>,
ReduceDescription<4, 3, 2, false, true>, // for MIN
ReduceDescription<4, 4, 2, false, true>,
ReduceDescription<4, 1, 2, false, true>,
ReduceDescription<2, 1, 2, false, true>,
ReduceDescription<4, 3, 3, false, true>, // for MAX
ReduceDescription<4, 4, 3, false, true>,
ReduceDescription<4, 1, 3, false, true>,
ReduceDescription<2, 1, 3, false, true>,
ReduceDescription<4, 3, 4, false, true>, // for AMAX
ReduceDescription<4, 4, 4, false, true>,
ReduceDescription<4, 1, 4, false, true>,
ReduceDescription<2, 1, 4, false, true>>;
std::tuple<ReduceDescription<4, 3, ReduceTensorOp::ADD, false, false>, // for ADD
ReduceDescription<4, 4, ReduceTensorOp::ADD, false, false>,
ReduceDescription<4, 1, ReduceTensorOp::ADD, false, false>,
ReduceDescription<2, 1, ReduceTensorOp::ADD, false, false>,
ReduceDescription<4, 3, ReduceTensorOp::AVG, false, false>, // for AVG
ReduceDescription<4, 4, ReduceTensorOp::AVG, false, false>,
ReduceDescription<4, 1, ReduceTensorOp::AVG, false, false>,
ReduceDescription<2, 1, ReduceTensorOp::AVG, false, false>,
ReduceDescription<4, 3, ReduceTensorOp::NORM2, false, false>, // for NORM2
ReduceDescription<4, 4, ReduceTensorOp::NORM2, false, false>,
ReduceDescription<4, 1, ReduceTensorOp::NORM2, false, false>,
ReduceDescription<2, 1, ReduceTensorOp::NORM2, false, false>,
ReduceDescription<4, 3, ReduceTensorOp::MIN, false, false>, // for MIN
ReduceDescription<4, 4, ReduceTensorOp::MIN, false, false>,
ReduceDescription<4, 1, ReduceTensorOp::MIN, false, false>,
ReduceDescription<2, 1, ReduceTensorOp::MIN, false, false>,
ReduceDescription<4, 3, ReduceTensorOp::MAX, false, false>, // for MAX
ReduceDescription<4, 4, ReduceTensorOp::MAX, false, false>,
ReduceDescription<4, 1, ReduceTensorOp::MAX, false, false>,
ReduceDescription<2, 1, ReduceTensorOp::MAX, false, false>,
ReduceDescription<4, 3, ReduceTensorOp::AMAX, false, false>, // for AMAX
ReduceDescription<4, 4, ReduceTensorOp::AMAX, false, false>,
ReduceDescription<4, 1, ReduceTensorOp::AMAX, false, false>,
ReduceDescription<2, 1, ReduceTensorOp::AMAX, false, false>,
ReduceDescription<4, 3, ReduceTensorOp::MIN, false, true>, // for MIN
ReduceDescription<4, 4, ReduceTensorOp::MIN, false, true>,
ReduceDescription<4, 1, ReduceTensorOp::MIN, false, true>,
ReduceDescription<2, 1, ReduceTensorOp::MIN, false, true>,
ReduceDescription<4, 3, ReduceTensorOp::MAX, false, true>, // for MAX
ReduceDescription<4, 4, ReduceTensorOp::MAX, false, true>,
ReduceDescription<4, 1, ReduceTensorOp::MAX, false, true>,
ReduceDescription<2, 1, ReduceTensorOp::MAX, false, true>,
ReduceDescription<4, 3, ReduceTensorOp::AMAX, false, true>, // for AMAX
ReduceDescription<4, 4, ReduceTensorOp::AMAX, false, true>,
ReduceDescription<4, 1, ReduceTensorOp::AMAX, false, true>,
ReduceDescription<2, 1, ReduceTensorOp::AMAX, false, true>>;
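
The tuple above enumerates every (Rank, NumReduceDim, ReduceOpId, PropagateNan, UseIndex) combination for which device instances are compiled; the profiler walks it and dispatches to the first entry that matches the runtime request. A rough, illustrative sketch of such a compile-time walk (the helper name for_each_reduce_description is invented here; the real code uses its own iteration utility):

#include <tuple> // std::apply

// Illustrative sketch only: visit each ReduceDescription in the tuple above.
// ReduceDescription is an empty type, so default-constructing the tuple is cheap.
template <typename Handler>
void for_each_reduce_description(Handler&& handle)
{
    std::apply([&](auto... descriptions) { (handle(descriptions), ...); },
               reduce_description_instances{});
}

// Example: count the registered descriptions that reduce 3 of 4 dimensions.
//   int n = 0;
//   for_each_reduce_description([&](auto desc) {
//       using Desc = decltype(desc);
//       if(Desc::Rank_ == 4 && Desc::NumReduceDim_ == 3) { ++n; }
//   });
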
template <typename DescriptionType>
bool description_match(const DescriptionType& description,
@@ -78,9 +82,8 @@ bool description_match(const DescriptionType& description,
bool PropagateNan,
bool UseIndex)
{
if(description.Rank_ != Rank || description.ReduceOpId_ != static_cast<int>(ReduceOpId) ||
description.PropagateNan_ != static_cast<int>(PropagateNan) ||
description.UseIndex_ != static_cast<int>(UseIndex))
if(description.Rank_ != Rank || description.ReduceOpId_ != ReduceOpId ||
description.PropagateNan_ != PropagateNan || description.UseIndex_ != UseIndex)
return (false);
if(DescriptionType::NumReduceDim_ != reduceDims.size())
@@ -99,11 +102,10 @@ bool description_match(const DescriptionType& description,
namespace ck {
namespace profiler {
template <index_t Rank, index_t NumReduceDim>
static inline std::vector<int> get_invariant_dims(const std::vector<int>& reduceDims)
template <int Rank, int NumReduceDim>
static inline std::array<int, Rank - NumReduceDim>
get_invariant_dims(const std::array<int, NumReduceDim>& reduceDims)
{
assert(NumReduceDim == reduceDims.size());
int reduceFlag = 0;
// flag the bits for the reduceDims
@@ -112,13 +114,15 @@ static inline std::vector<int> get_invariant_dims(const std::vector<int>& reduce
reduceFlag |= 1 << reduceDims[i];
};
std::vector<int> invariantDims;
std::array<int, Rank - NumReduceDim> invariantDims;
// collect invariant dimensions
int dim = 0;
for(int i = 0; i < Rank; i++)
if((reduceFlag & (1 << i)) == 0)
{
invariantDims.push_back(i);
invariantDims[dim] = i;
dim++;
};
return invariantDims;
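
To make the bitmask logic concrete: with Rank = 4 and reduceDims = {1, 3}, reduceFlag becomes 0b1010, dimensions 0 and 2 fail the (reduceFlag & (1 << i)) test, and invariantDims comes out as {0, 2}. A small self-contained check that mirrors the function above (hypothetical test code, not part of the commit):

// Hypothetical standalone check of the invariant-dimension computation above.
#include <array>
#include <cassert>

int main()
{
    constexpr int Rank         = 4;
    constexpr int NumReduceDim = 2;
    const std::array<int, NumReduceDim> reduceDims{1, 3};

    int reduceFlag = 0;
    for(int i = 0; i < NumReduceDim; i++)
        reduceFlag |= 1 << reduceDims[i]; // 0b1010 for dims {1, 3}

    std::array<int, Rank - NumReduceDim> invariantDims{};
    int dim = 0;
    for(int i = 0; i < Rank; i++)
        if((reduceFlag & (1 << i)) == 0)
            invariantDims[dim++] = i;

    assert(invariantDims[0] == 0 && invariantDims[1] == 2); // keeps dims {0, 2}
    return 0;
}
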
@@ -137,7 +141,7 @@ bool profile_reduce_impl_impl(bool do_verification,
bool do_dumpout,
bool time_kernel,
const std::vector<size_t>& inLengths,
const std::vector<int>& reduceDims,
const std::array<int, NumReduceDim>& reduceDims,
float alpha,
float beta)
{
@@ -145,6 +149,8 @@ bool profile_reduce_impl_impl(bool do_verification,
using namespace ck::tensor_operation::device::instance;
using ck::host_common::dumpBufferToFile;
constexpr index_t NumOutDim = (Rank - NumReduceDim == 0) ? 1 : Rank - NumReduceDim;
constexpr bool op_support_indices =
(ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
ReduceOpId == ReduceTensorOp::AMAX);
@@ -279,28 +285,32 @@ bool profile_reduce_impl_impl(bool do_verification,
reduce_unary_operator<ReduceOpId, true, true>::GetElementwiseOperator(
static_cast<int32_t>(reduce_total_length));
using DeviceReduceInstPtr0 =
DeviceReducePtr<InElementwiseOperation, AccElementwiseOperation>;
using DeviceReduceInstPtr =
DeviceReducePtr<Rank, NumReduceDim, InElementwiseOperation, AccElementwiseOperation>;
std::vector<DeviceReduceInstPtr0> reduce0_ptrs;
std::vector<DeviceReduceInstPtr> reduce_ptrs;
add_device_reduce_instance_threadwise<InDataType,
AccDataType,
OutDataType,
Rank,
NumReduceDim,
ReduceOpId,
ReduceOperation,
InElementwiseOperation,
AccElementwiseOperation,
PropagateNan,
UseIndex>(reduce0_ptrs);
UseIndex>(reduce_ptrs);
add_device_reduce_instance_blockwise<InDataType,
AccDataType,
OutDataType,
Rank,
NumReduceDim,
ReduceOpId,
ReduceOperation,
InElementwiseOperation,
AccElementwiseOperation,
PropagateNan,
UseIndex>(reduce0_ptrs);
UseIndex>(reduce_ptrs);
if constexpr(use_atomic_add)
{
@@ -309,12 +319,14 @@ bool profile_reduce_impl_impl(bool do_verification,
OutDataType,
Rank,
NumReduceDim,
ReduceOpId,
ReduceOperation,
InElementwiseOperation,
AccElementwiseOperation,
PropagateNan,
UseIndex>(reduce0_ptrs);
UseIndex>(reduce_ptrs);
}
if(reduce0_ptrs.empty())
if(reduce_ptrs.empty())
{
throw std::runtime_error("Wrong! No device REDUCE instance found");
};
@@ -342,22 +354,22 @@ bool profile_reduce_impl_impl(bool do_verification,
acc_elementwise_op);
};
std::vector<ck::index_t> i_inLengths;
std::vector<ck::index_t> i_inStrides;
std::vector<ck::index_t> i_outLengths;
std::vector<ck::index_t> i_outStrides;
std::array<index_t, Rank> arrInLengths;
std::array<index_t, Rank> arrInStrides;
std::array<index_t, NumOutDim> arrOutLengths;
std::array<index_t, NumOutDim> arrOutStrides;
i_inLengths.assign(inLengths.begin(), inLengths.end());
i_inStrides.assign(inStrides.begin(), inStrides.end());
i_outLengths.assign(outLengths.begin(), outLengths.end());
i_outStrides.assign(outStrides.begin(), outStrides.end());
std::copy(inLengths.begin(), inLengths.end(), arrInLengths.begin());
std::copy(inStrides.begin(), inStrides.end(), arrInStrides.begin());
std::copy(outLengths.begin(), outLengths.end(), arrOutLengths.begin());
std::copy(outStrides.begin(), outStrides.end(), arrOutStrides.begin());
for(auto& reduce_ptr : reduce0_ptrs)
for(auto& reduce_ptr : reduce_ptrs)
{
auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths,
i_inStrides,
i_outLengths,
i_outStrides,
auto argument_ptr = reduce_ptr->MakeArgumentPointer(arrInLengths,
arrInStrides,
arrOutLengths,
arrOutStrides,
reduceDims,
alpha,
beta,
@@ -478,22 +490,25 @@ bool profile_reduce_impl(bool do_verification,
descType{}, inLengths.size(), reduceDims, ReduceOpId, PropagateNan, UseIndex))
return;
pass = pass &&
profile_reduce_impl_impl<InDataType,
AccDataType,
OutDataType,
descType::Rank_,
descType::NumReduceDim_,
static_cast<ReduceTensorOp>(descType::ReduceOpId_),
static_cast<bool>(descType::PropagateNan_),
static_cast<bool>(descType::UseIndex_)>(do_verification,
init_method,
do_dumpout,
time_kernel,
inLengths,
reduceDims,
alpha,
beta);
std::array<ck::index_t, descType::NumReduceDim_> arrReduceDims;
std::copy(reduceDims.begin(), reduceDims.end(), arrReduceDims.begin());
pass = pass && profile_reduce_impl_impl<InDataType,
AccDataType,
OutDataType,
descType::Rank_,
descType::NumReduceDim_,
static_cast<ReduceTensorOp>(descType::ReduceOpId_),
descType::PropagateNan_,
descType::UseIndex_>(do_verification,
init_method,
do_dumpout,
time_kernel,
inLengths,
arrReduceDims,
alpha,
beta);
matched = true;
});
@@ -81,7 +81,7 @@ def parse_logfile(logfile):
StrideA=[]
StrideB=[]
StrideC=[]
if 'perf_gemm' in logfile:
if 'perf_gemm.log' in logfile:
for line in open(logfile):
if 'Best Perf' in line:
lst=line.split()
@@ -120,14 +120,14 @@ def parse_logfile(logfile):
res = [x for _,x in sorted(zip(tests,tflops))]
#sorted_kernels = [x for _,x in sorted(zip(tests,kernels))]
test_list=list(range(1,len(tests)+1))
#parse conv_fwd performance tests:
elif 'conv_fwd' in logfile:
#parse conv_fwd and conv_bwd performance tests:
elif 'conv_fwd' in logfile or 'conv_bwd_data' in logfile:
for line in open(logfile):
if 'tflops:' in line:
lst=line.split()
res.append(lst[1])
#parse all other performance tests:
elif 'resnet50' in logfile or 'batched_gemm' in logfile or 'grouped_gemm' in logfile or 'conv_bwd_data' in logfile or 'gemm_bilinear' in logfile or 'reduction' in logfile:
elif 'resnet50' in logfile or 'batched_gemm' in logfile or 'grouped_gemm' in logfile or 'gemm_bilinear' in logfile or 'reduction' in logfile:
for line in open(logfile):
if 'Best Perf' in line:
lst=line.split()
@@ -149,7 +149,7 @@ def store_new_test_result(table_name, test_results, testlist, branch_name, node_
df=pd.DataFrame(data=[params],columns=['Branch_ID','Node_ID','GPU_arch','Compute Units','ROCM_version','HIP_version','Environment','Datetime'])
df_add=pd.DataFrame(data=[test_results],columns=testlist)
df=pd.concat([df,df_add],axis=1)
print("new test results dataframe:",df)
#print("new test results dataframe:",df)
df.to_sql(table_name,connection,if_exists='append',index=False)
return 0
@@ -165,7 +165,7 @@ def compare_test_to_baseline(baseline,test,testlist):
print("test # ",i,"shows regression by {:.3f}%".format(
(float(test[i])-base_list[i])/base_list[i]*100))
regression=1
ave_perf=ave_perf+float(test[i])/base_list[i]
if base_list[i]>0: ave_perf=ave_perf+float(test[i])/base_list[i]
if regression==0:
print("no regressions found")
ave_perf=ave_perf/len(base_list)
@@ -248,7 +248,7 @@ def main():
conn = sqlEngine.connect()
#save gemm performance tests:
if 'perf_gemm' in filename:
if 'perf_gemm.log' in filename:
#write the ck_gemm_test_params table only needed once the test set changes
#post_test_params(test_list,conn)
for i in range(1,len(results)+1):