Commit a3b4c5cb authored by wangshaojie6

merge develop branch and add gridwise pipeline v3

parents 48918ab9 1677cf70
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F16_F32_F16_HPP
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F16_F32_F16_HPP
#include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_reduce_instance {
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 3); // for ADD
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 4);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 3); // for AVG
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 4);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 3); // for NORM2
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 4);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1);
// clang-format on
} // namespace device_reduce_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
#endif
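The instantiation tables in these headers encode ReduceOpId, NanPropaOpt, and IndicesOpt as bare integers; the inline comments (ADD, AVG, NORM2, MIN, MAX, AMAX) give the intended meaning, and the *_BY_ID helper macros cast the integers to the typed template parameters. The standalone snippet below only illustrates that ID convention; the enum here is defined for the example and is not the library's ReduceTensorOp.

// Illustration only: this enum mirrors the numeric codes used in the tables
// above (0 = ADD, 2 = MIN, 3 = MAX, 4 = AMAX, 5 = AVG, 7 = NORM2), as read
// from the inline comments; it is not the library's definition.
#include <iostream>

enum class ReduceOpIdForIllustration : int
{
    ADD   = 0,
    MIN   = 2,
    MAX   = 3,
    AMAX  = 4,
    AVG   = 5,
    NORM2 = 7,
};

int main()
{
    // e.g. the row "(half_t, float, half_t, 5, 0, 0, 4, 1)" above requests an
    // AVG reduction of 1 of 4 dimensions, f16 in/out with f32 accumulation.
    auto op = static_cast<ReduceOpIdForIllustration>(5);
    std::cout << (op == ReduceOpIdForIllustration::AVG ? "AVG" : "other") << '\n';
    return 0;
}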
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F32_F32_F32_HPP
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F32_F32_F32_HPP
#include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_reduce_instance {
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 3); // for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 4);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 2, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 3); // for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 4);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 2, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 3); // for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 4);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 2, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 3); // for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 4);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 2, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 3); // for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 4);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 2, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 3); // for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 4);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 3); // for NORM2
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 4);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 2, 1);
// clang-format on
} // namespace device_reduce_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
#endif
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F32_F64_F32_HPP
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F32_F64_F32_HPP
#include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_reduce_instance {
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 3); // for NORM2
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 4);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 2, 1);
// clang-format on
} // namespace device_reduce_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
#endif
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F64_F64_F64_HPP
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_F64_F64_F64_HPP
#include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_reduce_instance {
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 3); // for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 4);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 2, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 3); // for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 4);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 2, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 3); // for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 4);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 2, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 3); // for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 4);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 2, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 3); // for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 4);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 2, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 3); // for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 4);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 2, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 3); // for NORM2
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 4);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 2, 1);
// Will be moved to use MultiBlockAtomicAdd
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 4);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 2, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 4);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1);
// clang-format on
} // namespace device_reduce_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
#endif
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_I8_I32_I8_HPP
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_I8_I32_I8_HPP
#include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_reduce_instance {
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 3); // for ADD
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 4);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 2, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 3); // for AVG
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 4);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 1);
// clang-format on
} // namespace device_reduce_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
#endif
#ifndef DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_I8_I8_I8_HPP
#define DEVICE_REDUCE_INSTANCE_MULTIBLOCK_PARTIAL_REDUCE_I8_I8_I8_HPP
#include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_reduce_instance {
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 3); // for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 4);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 2, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 3); // for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 4);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 2, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 3); // for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 4);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 2, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 3); // for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 4);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 2, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 3); // for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 4);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 2, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 3); // for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 4);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1);
// clang-format on
} // namespace device_reduce_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
#endif
@@ -58,8 +58,8 @@ template <typename InDataType,
          int Rank,
          int NumReduceDim,
          ReduceTensorOp ReduceOpId,
-          NanPropagation NanOpt,
-          ReduceTensorIndices IndicesOpt>
+          bool PropagateNan,
+          bool UseIndex>
void add_device_reduce_instance_threadwise(
    std::vector<deviceReduceThreadWisePtrType<AccDataType, ReduceOpId>>& device_op_instances)
{
@@ -73,9 +73,7 @@ void add_device_reduce_instance_threadwise(
    constexpr bool Indexable =
        (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
         ReduceOpId == ReduceTensorOp::AMAX);
-    constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES);
-    constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true;
+    constexpr bool OutputIndex = Indexable && UseIndex;

    using cfg1 = ReductionConfiguration_1<256, 256, 1>;
@@ -93,10 +91,9 @@ void add_device_reduce_instance_threadwise(
            InElementwiseOperation,
            AccElementwiseOperation,
            PropagateNan,
-            NeedIndices,
+            OutputIndex,
+            false, // HaveIndexInputIfOutputIndex
            cfg1::BlockSize_,
-            cfg1::MThreadClusterSize_,
-            cfg1::KThreadClusterSize_,
            cfg2::MThreadSliceSize_,
            cfg2::KThreadSliceSize_,
            cfg2::InSrcVectorDim_,
@@ -107,54 +104,54 @@ void add_device_reduce_instance_threadwise(
    });
};

#define ADD_THREADWISE_INST_BY_TYPE( \
-    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
+    inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim) \
    template void add_device_reduce_instance_threadwise<inT, \
                                                        compT, \
                                                        outT, \
                                                        Rank, \
                                                        NumReduceDim, \
                                                        ReduceOpId, \
-                                                        NanOpt, \
-                                                        IndicesOpt>( \
+                                                        PropagateNan, \
+                                                        UseIndex>( \
        std::vector<deviceReduceThreadWisePtrType<compT, ReduceOpId>> & device_op_instances)

#define ADD_THREADWISE_INST_BY_ID( \
    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
    ADD_THREADWISE_INST_BY_TYPE(inT, \
                                compT, \
                                outT, \
                                static_cast<ReduceTensorOp>(ReduceOpId), \
-                                static_cast<NanPropagation>(NanOpt), \
-                                static_cast<ReduceTensorIndices>(IndicesOpt), \
+                                static_cast<bool>(NanOpt), \
+                                static_cast<bool>(IndicesOpt), \
                                Rank, \
                                NumReduceDim)

#define ADD_THREADWISE_INST_REF_BY_TYPE( \
-    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
+    inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim) \
    extern template void add_device_reduce_instance_threadwise<inT, \
                                                               compT, \
                                                               outT, \
                                                               Rank, \
                                                               NumReduceDim, \
                                                               ReduceOpId, \
-                                                               NanOpt, \
-                                                               IndicesOpt>( \
+                                                               PropagateNan, \
+                                                               UseIndex>( \
        std::vector<DeviceReducePtr< \
            typename reduce_unary_operator<compT, ReduceOpId, true, true>::InElementwiseOperation, \
            typename reduce_unary_operator<compT, ReduceOpId, true, true>:: \
                AccElementwiseOperation>> & \
            device_op_instances)

#define ADD_THREADWISE_INST_REF_BY_ID( \
    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
    ADD_THREADWISE_INST_REF_BY_TYPE(inT, \
                                    compT, \
                                    outT, \
                                    static_cast<ReduceTensorOp>(ReduceOpId), \
-                                    static_cast<NanPropagation>(NanOpt), \
-                                    static_cast<ReduceTensorIndices>(IndicesOpt), \
+                                    static_cast<bool>(NanOpt), \
+                                    static_cast<bool>(IndicesOpt), \
                                    Rank, \
                                    NumReduceDim)

} // namespace device_reduce_instance
...
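The hunk above switches the NanOpt and IndicesOpt template parameters from the NanPropagation and ReduceTensorIndices enums to plain bools (PropagateNan, UseIndex), while the *_BY_ID macros keep accepting 0/1 integers and now static_cast them to bool. Below is a minimal standalone sketch of that pattern; all names are invented for the example.

// Minimal sketch (names are stand-ins, not the library's) of the new pattern:
// integer flags at the macro interface, bool non-type template parameters underneath.
#include <iostream>

template <bool PropagateNan, bool UseIndex>
void describe_instance()
{
    std::cout << "PropagateNan=" << PropagateNan << " UseIndex=" << UseIndex << '\n';
}

// Mirrors how ADD_THREADWISE_INST_BY_ID forwards its 0/1 arguments.
#define DESCRIBE_BY_ID(NanOpt, IndicesOpt) \
    describe_instance<static_cast<bool>(NanOpt), static_cast<bool>(IndicesOpt)>()

int main()
{
    DESCRIBE_BY_ID(0, 1); // prints "PropagateNan=0 UseIndex=1"
    return 0;
}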
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_B16_F32_B16_HPP
#define DEVICE_REDUCE_INSTANCE_THREADWISE_B16_F32_B16_HPP
-#include "reduction_enums.hpp"
-#include "reduction_operator_mapping.hpp"
+#include "data_type.hpp"
#include "device_reduce_instance_threadwise.hpp"
namespace ck {
...
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F16_F16_HPP
#define DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F16_F16_HPP
-#include "reduction_enums.hpp"
-#include "reduction_operator_mapping.hpp"
+#include "data_type.hpp"
#include "device_reduce_instance_threadwise.hpp"
namespace ck {
...
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F32_F16_HPP
#define DEVICE_REDUCE_INSTANCE_THREADWISE_F16_F32_F16_HPP
-#include "reduction_enums.hpp"
-#include "reduction_operator_mapping.hpp"
+#include "data_type.hpp"
#include "device_reduce_instance_threadwise.hpp"
namespace ck {
...
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F32_F32_HPP
#define DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F32_F32_HPP
-#include "reduction_enums.hpp"
-#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_threadwise.hpp"
namespace ck {
...
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F64_F32_HPP
#define DEVICE_REDUCE_INSTANCE_THREADWISE_F32_F64_F32_HPP
-#include "reduction_enums.hpp"
-#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_threadwise.hpp"
namespace ck {
...
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_F64_F64_F64_HPP
#define DEVICE_REDUCE_INSTANCE_THREADWISE_F64_F64_F64_HPP
-#include "reduction_enums.hpp"
-#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_threadwise.hpp"
namespace ck {
...
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I32_I8_HPP
#define DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I32_I8_HPP
-#include "reduction_enums.hpp"
-#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_threadwise.hpp"
namespace ck {
...
#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I8_I8_HPP
#define DEVICE_REDUCE_INSTANCE_THREADWISE_I8_I8_I8_HPP
-#include "reduction_enums.hpp"
-#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_threadwise.hpp"
namespace ck {
...
@@ -24,7 +24,7 @@ check_err(const std::vector<T>& out,
          const std::vector<T>& ref,
          const std::string& msg = "Error: Incorrect results!",
          double rtol = 1e-5,
-          double atol = 1e-8)
+          double atol = 3e-6)
{
    if(out.size() != ref.size())
    {
@@ -173,8 +173,8 @@ check_err(const std::vector<T>& out,
    {
        if(out[i] != ref[i])
        {
-            std::cout << "out[" << i << "] != ref[" << i << "]: " << out[i] << " != " << ref[i]
-                      << std::endl
+            std::cout << "out[" << i << "] != ref[" << i << "]: " << static_cast<int>(out[i])
+                      << " != " << static_cast<int>(ref[i]) << std::endl
                      << msg << std::endl;
            return false;
        }
...
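The second hunk casts the int8 elements to int before streaming them. The reason, shown in the standalone snippet below, is that operator<< resolves int8_t to a character overload, so the value would otherwise print as an (often unprintable) character rather than a number.

// Standalone illustration (not the library's check_err) of the printing pitfall.
#include <cstdint>
#include <iostream>

int main()
{
    std::int8_t v = 7;
    std::cout << v << '\n';                   // streams a char (BEL), not "7"
    std::cout << static_cast<int>(v) << '\n'; // prints 7
    return 0;
}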
@@ -146,19 +146,19 @@ struct ConvParams
               const std::vector<ck::index_t>& left_pads,
               const std::vector<ck::index_t>& right_pads);

-    ck::index_t num_dim_spatial;
-    ck::index_t N;
-    ck::index_t K;
-    ck::index_t C;
-    std::vector<ck::index_t> filter_spatial_lengths;
-    std::vector<ck::index_t> input_spatial_lengths;
-    std::vector<ck::index_t> conv_filter_strides;
-    std::vector<ck::index_t> conv_filter_dilations;
-    std::vector<ck::index_t> input_left_pads;
-    std::vector<ck::index_t> input_right_pads;
+    ck::index_t num_dim_spatial_;
+    ck::index_t N_;
+    ck::index_t K_;
+    ck::index_t C_;
+    std::vector<ck::index_t> filter_spatial_lengths_;
+    std::vector<ck::index_t> input_spatial_lengths_;
+    std::vector<ck::index_t> conv_filter_strides_;
+    std::vector<ck::index_t> conv_filter_dilations_;
+    std::vector<ck::index_t> input_left_pads_;
+    std::vector<ck::index_t> input_right_pads_;

    std::vector<ck::index_t> GetOutputSpatialLengths() const;
};
@@ -268,10 +268,10 @@ void run_reference_convolution_forward(const ConvParams& params,
    auto ref_argument = ref_conv.MakeArgument(input,
                                              weights,
                                              output,
-                                              params.conv_filter_strides,
-                                              params.conv_filter_dilations,
-                                              params.input_left_pads,
-                                              params.input_right_pads,
+                                              params.conv_filter_strides_,
+                                              params.conv_filter_dilations_,
+                                              params.input_left_pads_,
+                                              params.input_right_pads_,
                                              PassThrough{},
                                              PassThrough{},
                                              PassThrough{});
@@ -437,17 +437,17 @@ class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType,
    virtual InTensorsTuple GetInputTensors() const override
    {
-        std::vector<std::size_t> input_dims{static_cast<std::size_t>(params_.N),
-                                            static_cast<std::size_t>(params_.C)};
+        std::vector<std::size_t> input_dims{static_cast<std::size_t>(params_.N_),
+                                            static_cast<std::size_t>(params_.C_)};
        input_dims.insert(std::end(input_dims),
-                          std::begin(params_.input_spatial_lengths),
-                          std::end(params_.input_spatial_lengths));
+                          std::begin(params_.input_spatial_lengths_),
+                          std::end(params_.input_spatial_lengths_));

-        std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params_.K),
-                                             static_cast<std::size_t>(params_.C)};
+        std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params_.K_),
+                                             static_cast<std::size_t>(params_.C_)};
        filter_dims.insert(std::end(filter_dims),
-                           std::begin(params_.filter_spatial_lengths),
-                           std::end(params_.filter_spatial_lengths));
+                           std::begin(params_.filter_spatial_lengths_),
+                           std::end(params_.filter_spatial_lengths_));

        auto input = std::make_unique<Tensor<InDataType>>(
            get_host_tensor_descriptor(input_dims, InLayout{}));
@@ -465,8 +465,8 @@ class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType,
    virtual TensorPtr<OutDataType> GetOutputTensor() const override
    {
-        std::vector<std::size_t> output_dims{static_cast<std::size_t>(params_.N),
-                                             static_cast<std::size_t>(params_.K)};
+        std::vector<std::size_t> output_dims{static_cast<std::size_t>(params_.N_),
+                                             static_cast<std::size_t>(params_.K_)};
        output_dims.insert(std::end(output_dims),
                           std::begin(output_spatial_lengths_),
                           std::end(output_spatial_lengths_));
@@ -522,16 +522,16 @@ class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType,
            static_cast<InDataType*>(in_device_buffers[0]->GetDeviceBuffer()),
            static_cast<WeiDataType*>(in_device_buffers[1]->GetDeviceBuffer()),
            static_cast<OutDataType*>(out_device_buffer->GetDeviceBuffer()),
-            params_.N,
-            params_.K,
-            params_.C,
-            params_.input_spatial_lengths,
-            params_.filter_spatial_lengths,
+            params_.N_,
+            params_.K_,
+            params_.C_,
+            params_.input_spatial_lengths_,
+            params_.filter_spatial_lengths_,
            output_spatial_lengths_,
-            params_.conv_filter_strides,
-            params_.conv_filter_dilations,
-            params_.input_left_pads,
-            params_.input_right_pads,
+            params_.conv_filter_strides_,
+            params_.conv_filter_dilations_,
+            params_.input_left_pads_,
+            params_.input_right_pads_,
            InElementwiseOp{},
            WeiElementwiseOp{},
            OutElementwiseOp{});
@@ -539,20 +539,20 @@ class ConvFwdOpInstance : public ck::utils::OpInstance<OutDataType, InDataType,
    virtual std::size_t GetFlops() const override
    {
-        return get_flops(params_.N,
-                         params_.C,
-                         params_.K,
-                         params_.filter_spatial_lengths,
+        return get_flops(params_.N_,
+                         params_.C_,
+                         params_.K_,
+                         params_.filter_spatial_lengths_,
                         output_spatial_lengths_);
    }

    virtual std::size_t GetBtype() const override
    {
-        return get_btype<InDataType, WeiDataType, OutDataType>(params_.N,
-                                                               params_.C,
-                                                               params_.K,
-                                                               params_.input_spatial_lengths,
-                                                               params_.filter_spatial_lengths,
+        return get_btype<InDataType, WeiDataType, OutDataType>(params_.N_,
+                                                               params_.C_,
+                                                               params_.K_,
+                                                               params_.input_spatial_lengths_,
+                                                               params_.filter_spatial_lengths_,
                                                               output_spatial_lengths_);
    }
...
@@ -128,7 +128,7 @@ class OpInstanceRunEngine
    template <typename OpInstancePtr>
    ProfileBestConfig Profile(const std::vector<OpInstancePtr>& op_ptrs,
-                              int nrepeat = 100,
+                              bool time_kernel = false,
                              bool do_verification = false,
                              bool do_log = false)
    {
@@ -143,7 +143,7 @@ class OpInstanceRunEngine
            if(op_ptr->IsSupportedArgument(argument.get()))
            {
                std::string op_name = op_ptr->GetTypeString();
-                float avg_time = invoker->Run(argument.get(), nrepeat);
+                float avg_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
                std::size_t flops     = op_instance_.GetFlops();
                std::size_t num_btype = op_instance_.GetBtype();
...
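With this change, profiling no longer takes a repeat count; callers pass a time_kernel flag that is forwarded to each invoker inside a StreamConfig. The mock below is a simplified stand-in (none of these types are the CK classes) that only illustrates the new control flow.

// Simplified mock (assumed stand-ins, not the CK classes): a boolean timing
// flag is forwarded to each invoker via a stream-config object.
#include <iostream>
#include <vector>

struct StreamConfigMock // stand-in for the StreamConfig used above
{
    void* stream_id;
    bool  time_kernel;
};

struct InvokerMock
{
    float Run(const StreamConfigMock& cfg) const
    {
        // A real invoker launches the kernel and times it only when asked.
        return cfg.time_kernel ? 0.42f : 0.0f;
    }
};

float Profile(const std::vector<InvokerMock>& invokers, bool time_kernel = false)
{
    float best = -1.0f;
    for(const auto& inv : invokers)
    {
        const float avg_time = inv.Run(StreamConfigMock{nullptr, time_kernel});
        if(best < 0.0f || avg_time < best)
            best = avg_time;
    }
    return best;
}

int main()
{
    std::vector<InvokerMock> invokers(3);
    std::cout << Profile(invokers, /*time_kernel=*/true) << '\n';
    return 0;
}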
@@ -10,10 +10,31 @@ set(HOST_TENSOR_SOURCE
    host_tensor.cpp
)

-add_library(host_tensor SHARED ${HOST_TENSOR_SOURCE})
+add_library(host_tensor STATIC ${HOST_TENSOR_SOURCE})
+add_library(composable_kernel::host_tensor ALIAS host_tensor)

target_compile_features(host_tensor PUBLIC)
set_target_properties(host_tensor PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_include_directories(host_tensor SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
-install(TARGETS host_tensor LIBRARY DESTINATION lib)
+target_include_directories(host_tensor PUBLIC
+    "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck>"
+    "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/utility>"
+    "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/host_tensor>"
+)
+
+install(TARGETS host_tensor
+    EXPORT host_tensorTargets
+    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+    INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
+)
+
+install(EXPORT host_tensorTargets
+    FILE composable_kernelhost_tensorTargets.cmake
+    NAMESPACE composable_kernel::
+    DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
+)

clang_tidy_check(host_tensor)
@@ -2,7 +2,7 @@

DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
{
-    hipGetErrorString(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
+    hip_check_error(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
}

void* DeviceMem::GetDeviceBuffer() { return mpDeviceBuf; }
@@ -11,49 +11,48 @@ std::size_t DeviceMem::GetBufferSize() { return mMemSize; }

void DeviceMem::ToDevice(const void* p)
{
-    hipGetErrorString(
-        hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
+    hip_check_error(hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
}

void DeviceMem::FromDevice(void* p)
{
-    hipGetErrorString(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
+    hip_check_error(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
}

-void DeviceMem::SetZero() { hipGetErrorString(hipMemset(mpDeviceBuf, 0, mMemSize)); }
+void DeviceMem::SetZero() { hip_check_error(hipMemset(mpDeviceBuf, 0, mMemSize)); }

-DeviceMem::~DeviceMem() { hipGetErrorString(hipFree(mpDeviceBuf)); }
+DeviceMem::~DeviceMem() { hip_check_error(hipFree(mpDeviceBuf)); }

struct KernelTimerImpl
{
    KernelTimerImpl()
    {
-        hipGetErrorString(hipEventCreate(&mStart));
-        hipGetErrorString(hipEventCreate(&mEnd));
+        hip_check_error(hipEventCreate(&mStart));
+        hip_check_error(hipEventCreate(&mEnd));
    }

    ~KernelTimerImpl()
    {
-        hipGetErrorString(hipEventDestroy(mStart));
-        hipGetErrorString(hipEventDestroy(mEnd));
+        hip_check_error(hipEventDestroy(mStart));
+        hip_check_error(hipEventDestroy(mEnd));
    }

    void Start()
    {
-        hipGetErrorString(hipDeviceSynchronize());
-        hipGetErrorString(hipEventRecord(mStart, nullptr));
+        hip_check_error(hipDeviceSynchronize());
+        hip_check_error(hipEventRecord(mStart, nullptr));
    }

    void End()
    {
-        hipGetErrorString(hipEventRecord(mEnd, nullptr));
-        hipGetErrorString(hipEventSynchronize(mEnd));
+        hip_check_error(hipEventRecord(mEnd, nullptr));
+        hip_check_error(hipEventSynchronize(mEnd));
    }

    float GetElapsedTime() const
    {
        float time;
-        hipGetErrorString(hipEventElapsedTime(&time, mStart, mEnd));
+        hip_check_error(hipEventElapsedTime(&time, mStart, mEnd));
        return time;
    }
...
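This hunk replaces hipGetErrorString(...), which converts the error code to a string and then discards it, with hip_check_error(...). The helper's definition is not part of the hunk; the sketch below is an assumption about what such a helper typically looks like, not the repository's implementation: a thin wrapper that throws when a HIP call does not return hipSuccess.

// Assumed sketch of an error-checking helper; the actual hip_check_error in
// the repository may differ. Unlike the old hipGetErrorString(...) wrapping,
// a failing HIP call now surfaces instead of being silently ignored.
#include <hip/hip_runtime.h>
#include <sstream>
#include <stdexcept>

inline void hip_check_error_sketch(hipError_t err)
{
    if(err != hipSuccess)
    {
        std::ostringstream oss;
        oss << "HIP runtime error: " << hipGetErrorString(err);
        throw std::runtime_error(oss.str());
    }
}

// Usage mirrors the new call sites above, e.g.
//   hip_check_error_sketch(hipMalloc(&ptr, num_bytes));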