Commit 07a673c6 authored by carlushuang's avatar carlushuang
Browse files

Merge remote-tracking branch 'origin/develop' into cpu_avx2

parents c0f698d5 ac0d8066
...@@ -27,7 +27,7 @@ template <ck::index_t BlockSize, ...@@ -27,7 +27,7 @@ template <ck::index_t BlockSize,
ck::index_t ABlockTransferDstScalarPerVector_E2, ck::index_t ABlockTransferDstScalarPerVector_E2,
ck::index_t BThreadTransferSrcScalarPerVector_E2, ck::index_t BThreadTransferSrcScalarPerVector_E2,
ck::index_t CThreadTransferDstScalarPerVector_K, ck::index_t CThreadTransferDstScalarPerVector_K,
ck::ActivTypeEnum_t activ_type> ck::ActivTypeEnum activ_type>
struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_maxpool struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_maxpool
{ {
template <typename... Wei, template <typename... Wei,
...@@ -305,7 +305,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -305,7 +305,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
FloatAB, FloatAB,
FloatAcc, FloatAcc,
FloatC, FloatC,
InMemoryDataOperationEnum_t::Set, InMemoryDataOperationEnum::Set,
decltype(a_e0_e1_k_e2_grid_desc), decltype(a_e0_e1_k_e2_grid_desc),
decltype(b_e0_e1_n_ho_wo_e2_grid_desc), decltype(b_e0_e1_n_ho_wo_e2_grid_desc),
decltype(c_k_n_hop_wop_grid_desc), decltype(c_k_n_hop_wop_grid_desc),
......
...@@ -10,7 +10,7 @@ template <ck::index_t BlockSize, ...@@ -10,7 +10,7 @@ template <ck::index_t BlockSize,
typename FloatAB, typename FloatAB,
typename FloatAcc, typename FloatAcc,
typename FloatC, typename FloatC,
ck::InMemoryDataOperationEnum_t CGlobalMemoryDataOperation, ck::InMemoryDataOperationEnum CGlobalMemoryDataOperation,
typename AKMGridDesc, typename AKMGridDesc,
typename BKNGridDesc, typename BKNGridDesc,
typename CMNGridDesc, typename CMNGridDesc,
......
...@@ -10,7 +10,7 @@ template <ck::index_t BlockSize, ...@@ -10,7 +10,7 @@ template <ck::index_t BlockSize,
typename FloatAB, typename FloatAB,
typename FloatAcc, typename FloatAcc,
typename FloatC, typename FloatC,
ck::InMemoryDataOperationEnum_t CGlobalMemoryDataOperation, ck::InMemoryDataOperationEnum CGlobalMemoryDataOperation,
typename AK0MK1GridDesc, typename AK0MK1GridDesc,
typename BK0NK1GridDesc, typename BK0NK1GridDesc,
typename CMNGridDesc, typename CMNGridDesc,
......
...@@ -11,7 +11,7 @@ template <ck::index_t BlockSize, ...@@ -11,7 +11,7 @@ template <ck::index_t BlockSize,
typename FloatAB, typename FloatAB,
typename FloatAcc, typename FloatAcc,
typename FloatC, typename FloatC,
ck::InMemoryDataOperationEnum_t CGlobalMemoryDataOperation, ck::InMemoryDataOperationEnum CGlobalMemoryDataOperation,
typename AGridDesc_K0_M_K1, typename AGridDesc_K0_M_K1,
typename BGridDesc_K0_N_K, typename BGridDesc_K0_N_K,
typename CMNGridDesc, typename CMNGridDesc,
......
...@@ -10,7 +10,7 @@ template <ck::index_t BlockSize, ...@@ -10,7 +10,7 @@ template <ck::index_t BlockSize,
typename FloatAB, typename FloatAB,
typename FloatAcc, typename FloatAcc,
typename FloatC, typename FloatC,
ck::InMemoryDataOperationEnum_t CGlobalMemoryDataOperation, ck::InMemoryDataOperationEnum CGlobalMemoryDataOperation,
typename ABK0MK1GridDesc, typename ABK0MK1GridDesc,
typename BBK0NK1GridDesc, typename BBK0NK1GridDesc,
typename CMNGridDesc, typename CMNGridDesc,
......
...@@ -17,7 +17,7 @@ template <typename InDataType, ...@@ -17,7 +17,7 @@ template <typename InDataType,
typename InElementwiseOperation, typename InElementwiseOperation,
typename WeiElementwiseOperation, typename WeiElementwiseOperation,
typename OutElementwiseOperation> typename OutElementwiseOperation>
struct ReferenceConvWrw : public device::BaseOperator struct ReferenceConvBwdWeight : public device::BaseOperator
{ {
// Argument // Argument
struct Argument : public device::BaseArgument struct Argument : public device::BaseArgument
...@@ -62,7 +62,7 @@ struct ReferenceConvWrw : public device::BaseOperator ...@@ -62,7 +62,7 @@ struct ReferenceConvWrw : public device::BaseOperator
// Invoker // Invoker
struct Invoker : public device::BaseInvoker struct Invoker : public device::BaseInvoker
{ {
using Argument = ReferenceConvWrw::Argument; using Argument = ReferenceConvBwdWeight::Argument;
float Run(const Argument& arg) float Run(const Argument& arg)
{ {
...@@ -163,7 +163,7 @@ struct ReferenceConvWrw : public device::BaseOperator ...@@ -163,7 +163,7 @@ struct ReferenceConvWrw : public device::BaseOperator
auto str = std::stringstream(); auto str = std::stringstream();
// clang-format off // clang-format off
str << "ReferenceConvFwd" str << "ReferenceConvBwdWeight"
<< std::endl; << std::endl;
// clang-format on // clang-format on
......
...@@ -19,7 +19,7 @@ template <typename InDataType, ...@@ -19,7 +19,7 @@ template <typename InDataType,
typename WeiElementwiseOperation, typename WeiElementwiseOperation,
typename OutElementwiseOperation, typename OutElementwiseOperation,
ck::index_t NumDimSpatial = 2, ck::index_t NumDimSpatial = 2,
typename std::enable_if<NumDimSpatial >= 1 && NumDimSpatial <= 3, bool>::type = false> typename ck::enable_if<NumDimSpatial >= 1 && NumDimSpatial <= 3, bool>::type = false>
struct ReferenceConvBwdData : public device::BaseOperator struct ReferenceConvBwdData : public device::BaseOperator
{ {
// Argument // Argument
...@@ -71,7 +71,7 @@ struct ReferenceConvBwdData : public device::BaseOperator ...@@ -71,7 +71,7 @@ struct ReferenceConvBwdData : public device::BaseOperator
{ {
if constexpr(NumDimSpatial == 1) if constexpr(NumDimSpatial == 1)
{ {
auto f_nchw = [&](auto n, auto c, auto wi) { auto f_ncw = [&](auto n, auto c, auto wi) {
std::size_t K = arg.weight_.mDesc.GetLengths()[0]; std::size_t K = arg.weight_.mDesc.GetLengths()[0];
std::size_t X = arg.weight_.mDesc.GetLengths()[2]; std::size_t X = arg.weight_.mDesc.GetLengths()[2];
std::size_t Wo = arg.output_.mDesc.GetLengths()[2]; std::size_t Wo = arg.output_.mDesc.GetLengths()[2];
...@@ -108,7 +108,7 @@ struct ReferenceConvBwdData : public device::BaseOperator ...@@ -108,7 +108,7 @@ struct ReferenceConvBwdData : public device::BaseOperator
arg.input_(n, c, wi) = ck::type_convert<InDataType>(v_in); arg.input_(n, c, wi) = ck::type_convert<InDataType>(v_in);
}; };
make_ParallelTensorFunctor(f_nchw, make_ParallelTensorFunctor(f_ncw,
arg.input_.mDesc.GetLengths()[0], arg.input_.mDesc.GetLengths()[0],
arg.input_.mDesc.GetLengths()[1], arg.input_.mDesc.GetLengths()[1],
arg.input_.mDesc.GetLengths()[2])( arg.input_.mDesc.GetLengths()[2])(
...@@ -182,7 +182,7 @@ struct ReferenceConvBwdData : public device::BaseOperator ...@@ -182,7 +182,7 @@ struct ReferenceConvBwdData : public device::BaseOperator
} }
else if constexpr(NumDimSpatial == 3) else if constexpr(NumDimSpatial == 3)
{ {
auto f_nchw = [&](auto n, auto c, auto di, auto hi, auto wi) { auto f_ncdhw = [&](auto n, auto c, auto di, auto hi, auto wi) {
std::size_t K = arg.weight_.mDesc.GetLengths()[0]; std::size_t K = arg.weight_.mDesc.GetLengths()[0];
std::size_t Z = arg.weight_.mDesc.GetLengths()[2]; std::size_t Z = arg.weight_.mDesc.GetLengths()[2];
std::size_t Y = arg.weight_.mDesc.GetLengths()[3]; std::size_t Y = arg.weight_.mDesc.GetLengths()[3];
...@@ -252,7 +252,7 @@ struct ReferenceConvBwdData : public device::BaseOperator ...@@ -252,7 +252,7 @@ struct ReferenceConvBwdData : public device::BaseOperator
arg.input_(n, c, di, hi, wi) = ck::type_convert<InDataType>(v_in); arg.input_(n, c, di, hi, wi) = ck::type_convert<InDataType>(v_in);
}; };
make_ParallelTensorFunctor(f_nchw, make_ParallelTensorFunctor(f_ncdhw,
arg.input_.mDesc.GetLengths()[0], arg.input_.mDesc.GetLengths()[0],
arg.input_.mDesc.GetLengths()[1], arg.input_.mDesc.GetLengths()[1],
arg.input_.mDesc.GetLengths()[2], arg.input_.mDesc.GetLengths()[2],
......
...@@ -47,7 +47,7 @@ using reduce_configuration_2_instances_blockwise = std::tuple< ...@@ -47,7 +47,7 @@ using reduce_configuration_2_instances_blockwise = std::tuple<
>; >;
#endif #endif
template <typename AccDataType, ReduceTensorOp_t ReduceOpId> template <typename AccDataType, ReduceTensorOp ReduceOpId>
using deviceReduceBlockWisePtrType = DeviceReducePtr< using deviceReduceBlockWisePtrType = DeviceReducePtr<
typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation, typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation,
typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::AccElementwiseOperation>; typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::AccElementwiseOperation>;
...@@ -57,9 +57,9 @@ template <typename InDataType, ...@@ -57,9 +57,9 @@ template <typename InDataType,
typename OutDataType, typename OutDataType,
int Rank, int Rank,
int NumReduceDim, int NumReduceDim,
ReduceTensorOp_t ReduceOpId, ReduceTensorOp ReduceOpId,
NanPropagation_t NanOpt, NanPropagation NanOpt,
ReduceTensorIndices_t IndicesOpt> ReduceTensorIndices IndicesOpt>
void add_device_reduce_instance_blockwise( void add_device_reduce_instance_blockwise(
std::vector<deviceReduceBlockWisePtrType<AccDataType, ReduceOpId>>& device_op_instances) std::vector<deviceReduceBlockWisePtrType<AccDataType, ReduceOpId>>& device_op_instances)
{ {
...@@ -71,11 +71,11 @@ void add_device_reduce_instance_blockwise( ...@@ -71,11 +71,11 @@ void add_device_reduce_instance_blockwise(
AccElementwiseOperation; AccElementwiseOperation;
constexpr bool Indexable = constexpr bool Indexable =
(ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX || (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
ReduceOpId == ReduceTensorOp_t::AMAX); ReduceOpId == ReduceTensorOp::AMAX);
constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES); constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES);
constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true; constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true;
static_for<0, std::tuple_size<reduce_configuration_1_instances>::value, 1>{}([&](auto i) { static_for<0, std::tuple_size<reduce_configuration_1_instances>::value, 1>{}([&](auto i) {
using cfg1 = using cfg1 =
...@@ -128,9 +128,9 @@ void add_device_reduce_instance_blockwise( ...@@ -128,9 +128,9 @@ void add_device_reduce_instance_blockwise(
ADD_BLOCKWISE_INST_BY_TYPE(inT, \ ADD_BLOCKWISE_INST_BY_TYPE(inT, \
compT, \ compT, \
outT, \ outT, \
static_cast<ReduceTensorOp_t>(ReduceOpId), \ static_cast<ReduceTensorOp>(ReduceOpId), \
static_cast<NanPropagation_t>(NanOpt), \ static_cast<NanPropagation>(NanOpt), \
static_cast<ReduceTensorIndices_t>(IndicesOpt), \ static_cast<ReduceTensorIndices>(IndicesOpt), \
Rank, \ Rank, \
NumReduceDim) NumReduceDim)
...@@ -155,9 +155,9 @@ void add_device_reduce_instance_blockwise( ...@@ -155,9 +155,9 @@ void add_device_reduce_instance_blockwise(
ADD_BLOCKWISE_INST_REF_BY_TYPE(inT, \ ADD_BLOCKWISE_INST_REF_BY_TYPE(inT, \
compT, \ compT, \
outT, \ outT, \
static_cast<ReduceTensorOp_t>(ReduceOpId), \ static_cast<ReduceTensorOp>(ReduceOpId), \
static_cast<NanPropagation_t>(NanOpt), \ static_cast<NanPropagation>(NanOpt), \
static_cast<ReduceTensorIndices_t>(IndicesOpt), \ static_cast<ReduceTensorIndices>(IndicesOpt), \
Rank, \ Rank, \
NumReduceDim) NumReduceDim)
......
...@@ -34,7 +34,7 @@ using reduce_configuration_2_instances_blockwise_second_call = std::tuple< ...@@ -34,7 +34,7 @@ using reduce_configuration_2_instances_blockwise_second_call = std::tuple<
>; >;
#endif #endif
template <typename AccDataType, ReduceTensorOp_t ReduceOpId> template <typename AccDataType, ReduceTensorOp ReduceOpId>
using deviceReduceBlockWiseSecondCallPtrType = DeviceReducePtr< using deviceReduceBlockWiseSecondCallPtrType = DeviceReducePtr<
typename reduce_unary_operator<AccDataType, ReduceOpId, false, true>::InElementwiseOperation, typename reduce_unary_operator<AccDataType, ReduceOpId, false, true>::InElementwiseOperation,
typename reduce_unary_operator<AccDataType, ReduceOpId, false, true>::AccElementwiseOperation>; typename reduce_unary_operator<AccDataType, ReduceOpId, false, true>::AccElementwiseOperation>;
...@@ -44,9 +44,9 @@ template <typename InDataType, ...@@ -44,9 +44,9 @@ template <typename InDataType,
typename OutDataType, typename OutDataType,
int Rank, int Rank,
int NumReduceDim, int NumReduceDim,
ReduceTensorOp_t ReduceOpId, ReduceTensorOp ReduceOpId,
NanPropagation_t NanOpt, NanPropagation NanOpt,
ReduceTensorIndices_t IndicesOpt> ReduceTensorIndices IndicesOpt>
void add_device_reduce_instance_blockwise_second_call( void add_device_reduce_instance_blockwise_second_call(
std::vector<deviceReduceBlockWiseSecondCallPtrType<AccDataType, ReduceOpId>>& std::vector<deviceReduceBlockWiseSecondCallPtrType<AccDataType, ReduceOpId>>&
device_op_instances) device_op_instances)
...@@ -60,11 +60,11 @@ void add_device_reduce_instance_blockwise_second_call( ...@@ -60,11 +60,11 @@ void add_device_reduce_instance_blockwise_second_call(
AccElementwiseOperation; AccElementwiseOperation;
constexpr bool Indexable = constexpr bool Indexable =
(ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX || (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
ReduceOpId == ReduceTensorOp_t::AMAX); ReduceOpId == ReduceTensorOp::AMAX);
constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES); constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES);
constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true; constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true;
static_assert(std::is_same<InDataType, AccDataType>::value, static_assert(std::is_same<InDataType, AccDataType>::value,
"InDataType and AccDataType should be the same to use " "InDataType and AccDataType should be the same to use "
...@@ -122,9 +122,9 @@ void add_device_reduce_instance_blockwise_second_call( ...@@ -122,9 +122,9 @@ void add_device_reduce_instance_blockwise_second_call(
ADD_BLOCKWISE_SECOND_CALL_INST_BY_TYPE(inT, \ ADD_BLOCKWISE_SECOND_CALL_INST_BY_TYPE(inT, \
compT, \ compT, \
outT, \ outT, \
static_cast<ReduceTensorOp_t>(ReduceOpId), \ static_cast<ReduceTensorOp>(ReduceOpId), \
static_cast<NanPropagation_t>(NanOpt), \ static_cast<NanPropagation>(NanOpt), \
static_cast<ReduceTensorIndices_t>(IndicesOpt), \ static_cast<ReduceTensorIndices>(IndicesOpt), \
Rank, \ Rank, \
NumReduceDim) NumReduceDim)
...@@ -150,9 +150,9 @@ void add_device_reduce_instance_blockwise_second_call( ...@@ -150,9 +150,9 @@ void add_device_reduce_instance_blockwise_second_call(
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_TYPE(inT, \ ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_TYPE(inT, \
compT, \ compT, \
outT, \ outT, \
static_cast<ReduceTensorOp_t>(ReduceOpId), \ static_cast<ReduceTensorOp>(ReduceOpId), \
static_cast<NanPropagation_t>(NanOpt), \ static_cast<NanPropagation>(NanOpt), \
static_cast<ReduceTensorIndices_t>(IndicesOpt), \ static_cast<ReduceTensorIndices>(IndicesOpt), \
Rank, \ Rank, \
NumReduceDim) NumReduceDim)
......
...@@ -47,7 +47,7 @@ using reduce_configuration_2_instances_multiblock_atomic_add = std::tuple< ...@@ -47,7 +47,7 @@ using reduce_configuration_2_instances_multiblock_atomic_add = std::tuple<
>; >;
#endif #endif
template <typename AccDataType, ReduceTensorOp_t ReduceOperation> template <typename AccDataType, ReduceTensorOp ReduceOperation>
using deviceReduceMultiBlockAtomicAddPtrType = using deviceReduceMultiBlockAtomicAddPtrType =
DeviceReducePtr<typename reduce_unary_operator<AccDataType, ReduceOperation, true, true>:: DeviceReducePtr<typename reduce_unary_operator<AccDataType, ReduceOperation, true, true>::
InElementwiseOperation, InElementwiseOperation,
...@@ -59,9 +59,9 @@ template <typename InDataType, ...@@ -59,9 +59,9 @@ template <typename InDataType,
typename OutDataType, typename OutDataType,
int Rank, int Rank,
int NumReduceDim, int NumReduceDim,
ReduceTensorOp_t ReduceOpId, ReduceTensorOp ReduceOpId,
NanPropagation_t NanOpt, NanPropagation NanOpt,
ReduceTensorIndices_t IndicesOpt> ReduceTensorIndices IndicesOpt>
void add_device_reduce_instance_multiblock_atomic_add( void add_device_reduce_instance_multiblock_atomic_add(
std::vector<deviceReduceMultiBlockAtomicAddPtrType<AccDataType, ReduceOpId>>& std::vector<deviceReduceMultiBlockAtomicAddPtrType<AccDataType, ReduceOpId>>&
device_op_instances) device_op_instances)
...@@ -74,18 +74,18 @@ void add_device_reduce_instance_multiblock_atomic_add( ...@@ -74,18 +74,18 @@ void add_device_reduce_instance_multiblock_atomic_add(
AccElementwiseOperation; AccElementwiseOperation;
constexpr bool Indexable = constexpr bool Indexable =
(ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX || (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
ReduceOpId == ReduceTensorOp_t::AMAX); ReduceOpId == ReduceTensorOp::AMAX);
constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES); constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES);
constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true; constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true;
static_assert(IndicesOpt == ReduceTensorIndices_t::NO_INDICES, static_assert(IndicesOpt == ReduceTensorIndices::NO_INDICES,
"AtomicAdd can only be used with reduction operations without indices!"); "AtomicAdd can only be used with reduction operations without indices!");
constexpr bool op_acceptable = constexpr bool op_acceptable =
(ReduceOpId == ReduceTensorOp_t::ADD || ReduceOpId == ReduceTensorOp_t::MUL || (ReduceOpId == ReduceTensorOp::ADD || ReduceOpId == ReduceTensorOp::MUL ||
ReduceOpId == ReduceTensorOp_t::AVG || ReduceOpId == ReduceTensorOp_t::NORM1); ReduceOpId == ReduceTensorOp::AVG || ReduceOpId == ReduceTensorOp::NORM1);
constexpr bool out_type_acceptable = constexpr bool out_type_acceptable =
(std::is_same<OutDataType, float>::value || std::is_same<OutDataType, double>::value); (std::is_same<OutDataType, float>::value || std::is_same<OutDataType, double>::value);
...@@ -149,9 +149,9 @@ void add_device_reduce_instance_multiblock_atomic_add( ...@@ -149,9 +149,9 @@ void add_device_reduce_instance_multiblock_atomic_add(
ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_TYPE(inT, \ ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_TYPE(inT, \
compT, \ compT, \
outT, \ outT, \
static_cast<ReduceTensorOp_t>(ReduceOpId), \ static_cast<ReduceTensorOp>(ReduceOpId), \
static_cast<NanPropagation_t>(NanOpt), \ static_cast<NanPropagation>(NanOpt), \
static_cast<ReduceTensorIndices_t>(IndicesOpt), \ static_cast<ReduceTensorIndices>(IndicesOpt), \
Rank, \ Rank, \
NumReduceDim) NumReduceDim)
...@@ -176,9 +176,9 @@ void add_device_reduce_instance_multiblock_atomic_add( ...@@ -176,9 +176,9 @@ void add_device_reduce_instance_multiblock_atomic_add(
ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE(inT, \ ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE(inT, \
compT, \ compT, \
outT, \ outT, \
static_cast<ReduceTensorOp_t>(ReduceOpId), \ static_cast<ReduceTensorOp>(ReduceOpId), \
static_cast<NanPropagation_t>(NanOpt), \ static_cast<NanPropagation>(NanOpt), \
static_cast<ReduceTensorIndices_t>(IndicesOpt), \ static_cast<ReduceTensorIndices>(IndicesOpt), \
Rank, \ Rank, \
NumReduceDim) NumReduceDim)
......
...@@ -46,7 +46,7 @@ using reduce_configuration_2_instances_multiblock_partial_reduce = std::tuple< ...@@ -46,7 +46,7 @@ using reduce_configuration_2_instances_multiblock_partial_reduce = std::tuple<
>; >;
#endif #endif
template <typename AccDataType, ReduceTensorOp_t ReduceOpId> template <typename AccDataType, ReduceTensorOp ReduceOpId>
using deviceReduceMultiBlockPartialReducePtrType = DeviceReducePtr< using deviceReduceMultiBlockPartialReducePtrType = DeviceReducePtr<
typename reduce_unary_operator<AccDataType, ReduceOpId, true, false>::InElementwiseOperation, typename reduce_unary_operator<AccDataType, ReduceOpId, true, false>::InElementwiseOperation,
typename reduce_unary_operator<AccDataType, ReduceOpId, true, false>::AccElementwiseOperation>; typename reduce_unary_operator<AccDataType, ReduceOpId, true, false>::AccElementwiseOperation>;
...@@ -56,9 +56,9 @@ template <typename InDataType, ...@@ -56,9 +56,9 @@ template <typename InDataType,
typename OutDataType, typename OutDataType,
int Rank, int Rank,
int NumReduceDim, int NumReduceDim,
ReduceTensorOp_t ReduceOpId, ReduceTensorOp ReduceOpId,
NanPropagation_t NanOpt, NanPropagation NanOpt,
ReduceTensorIndices_t IndicesOpt> ReduceTensorIndices IndicesOpt>
void add_device_reduce_instance_multiblock_partial_reduce( void add_device_reduce_instance_multiblock_partial_reduce(
std::vector<deviceReduceMultiBlockPartialReducePtrType<AccDataType, ReduceOpId>>& std::vector<deviceReduceMultiBlockPartialReducePtrType<AccDataType, ReduceOpId>>&
device_op_instances) device_op_instances)
...@@ -72,11 +72,11 @@ void add_device_reduce_instance_multiblock_partial_reduce( ...@@ -72,11 +72,11 @@ void add_device_reduce_instance_multiblock_partial_reduce(
AccElementwiseOperation; AccElementwiseOperation;
constexpr bool Indexable = constexpr bool Indexable =
(ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX || (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
ReduceOpId == ReduceTensorOp_t::AMAX); ReduceOpId == ReduceTensorOp::AMAX);
constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES); constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES);
constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true; constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true;
static_for<0, std::tuple_size<reduce_configuration_1_instances>::value, 1>{}([&](auto i) { static_for<0, std::tuple_size<reduce_configuration_1_instances>::value, 1>{}([&](auto i) {
using cfg1 = using cfg1 =
...@@ -131,9 +131,9 @@ void add_device_reduce_instance_multiblock_partial_reduce( ...@@ -131,9 +131,9 @@ void add_device_reduce_instance_multiblock_partial_reduce(
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_TYPE(inT, \ ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_TYPE(inT, \
compT, \ compT, \
outT, \ outT, \
static_cast<ReduceTensorOp_t>(ReduceOpId), \ static_cast<ReduceTensorOp>(ReduceOpId), \
static_cast<NanPropagation_t>(NanOpt), \ static_cast<NanPropagation>(NanOpt), \
static_cast<ReduceTensorIndices_t>(IndicesOpt), \ static_cast<ReduceTensorIndices>(IndicesOpt), \
Rank, \ Rank, \
NumReduceDim) NumReduceDim)
...@@ -159,9 +159,9 @@ void add_device_reduce_instance_multiblock_partial_reduce( ...@@ -159,9 +159,9 @@ void add_device_reduce_instance_multiblock_partial_reduce(
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_TYPE(inT, \ ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_TYPE(inT, \
compT, \ compT, \
outT, \ outT, \
static_cast<ReduceTensorOp_t>(ReduceOpId), \ static_cast<ReduceTensorOp>(ReduceOpId), \
static_cast<NanPropagation_t>(NanOpt), \ static_cast<NanPropagation>(NanOpt), \
static_cast<ReduceTensorIndices_t>(IndicesOpt), \ static_cast<ReduceTensorIndices>(IndicesOpt), \
Rank, \ Rank, \
NumReduceDim) NumReduceDim)
......
...@@ -47,7 +47,7 @@ using reduce_configuration_2_instances_threadwise = std::tuple< ...@@ -47,7 +47,7 @@ using reduce_configuration_2_instances_threadwise = std::tuple<
>; >;
#endif #endif
template <typename AccDataType, ReduceTensorOp_t ReduceOpId> template <typename AccDataType, ReduceTensorOp ReduceOpId>
using deviceReduceThreadWisePtrType = DeviceReducePtr< using deviceReduceThreadWisePtrType = DeviceReducePtr<
typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation, typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation,
typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::AccElementwiseOperation>; typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::AccElementwiseOperation>;
...@@ -57,9 +57,9 @@ template <typename InDataType, ...@@ -57,9 +57,9 @@ template <typename InDataType,
typename OutDataType, typename OutDataType,
int Rank, int Rank,
int NumReduceDim, int NumReduceDim,
ReduceTensorOp_t ReduceOpId, ReduceTensorOp ReduceOpId,
NanPropagation_t NanOpt, NanPropagation NanOpt,
ReduceTensorIndices_t IndicesOpt> ReduceTensorIndices IndicesOpt>
void add_device_reduce_instance_threadwise( void add_device_reduce_instance_threadwise(
std::vector<deviceReduceThreadWisePtrType<AccDataType, ReduceOpId>>& device_op_instances) std::vector<deviceReduceThreadWisePtrType<AccDataType, ReduceOpId>>& device_op_instances)
{ {
...@@ -71,11 +71,11 @@ void add_device_reduce_instance_threadwise( ...@@ -71,11 +71,11 @@ void add_device_reduce_instance_threadwise(
AccElementwiseOperation; AccElementwiseOperation;
constexpr bool Indexable = constexpr bool Indexable =
(ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX || (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
ReduceOpId == ReduceTensorOp_t::AMAX); ReduceOpId == ReduceTensorOp::AMAX);
constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES); constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES);
constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true; constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true;
using cfg1 = ReductionConfiguration_1<256, 256, 1>; using cfg1 = ReductionConfiguration_1<256, 256, 1>;
...@@ -124,9 +124,9 @@ void add_device_reduce_instance_threadwise( ...@@ -124,9 +124,9 @@ void add_device_reduce_instance_threadwise(
ADD_THREADWISE_INST_BY_TYPE(inT, \ ADD_THREADWISE_INST_BY_TYPE(inT, \
compT, \ compT, \
outT, \ outT, \
static_cast<ReduceTensorOp_t>(ReduceOpId), \ static_cast<ReduceTensorOp>(ReduceOpId), \
static_cast<NanPropagation_t>(NanOpt), \ static_cast<NanPropagation>(NanOpt), \
static_cast<ReduceTensorIndices_t>(IndicesOpt), \ static_cast<ReduceTensorIndices>(IndicesOpt), \
Rank, \ Rank, \
NumReduceDim) NumReduceDim)
...@@ -151,9 +151,9 @@ void add_device_reduce_instance_threadwise( ...@@ -151,9 +151,9 @@ void add_device_reduce_instance_threadwise(
ADD_THREADWISE_INST_REF_BY_TYPE(inT, \ ADD_THREADWISE_INST_REF_BY_TYPE(inT, \
compT, \ compT, \
outT, \ outT, \
static_cast<ReduceTensorOp_t>(ReduceOpId), \ static_cast<ReduceTensorOp>(ReduceOpId), \
static_cast<NanPropagation_t>(NanOpt), \ static_cast<NanPropagation>(NanOpt), \
static_cast<ReduceTensorIndices_t>(IndicesOpt), \ static_cast<ReduceTensorIndices>(IndicesOpt), \
Rank, \ Rank, \
NumReduceDim) NumReduceDim)
......
#ifndef TEST_UTIL_HPP #ifndef CHECK_ERR_HPP
#define TEST_UTIL_HPP #define CHECK_ERR_HPP
#include <algorithm> #include <algorithm>
#include <cmath> #include <cmath>
#include <cstdlib> #include <cstdlib>
#include <half.hpp>
#include <iostream> #include <iostream>
#include <iomanip> #include <iomanip>
#include <iterator> #include <iterator>
...@@ -13,14 +14,15 @@ ...@@ -13,14 +14,15 @@
#include "data_type.hpp" #include "data_type.hpp"
namespace test { namespace ck {
namespace utils {
template <typename T> template <typename T>
typename std::enable_if<std::is_floating_point<T>::value && !std::is_same<T, ck::half_t>::value, typename std::enable_if<std::is_floating_point<T>::value && !std::is_same<T, half_t>::value,
bool>::type bool>::type
check_err(const std::vector<T>& out, check_err(const std::vector<T>& out,
const std::vector<T>& ref, const std::vector<T>& ref,
const std::string& msg, const std::string& msg = "Error: Incorrect results!",
double rtol = 1e-5, double rtol = 1e-5,
double atol = 1e-8) double atol = 1e-8)
{ {
...@@ -60,13 +62,12 @@ check_err(const std::vector<T>& out, ...@@ -60,13 +62,12 @@ check_err(const std::vector<T>& out,
} }
template <typename T> template <typename T>
typename std::enable_if<std::is_same<T, ck::bhalf_t>::value || std::is_same<T, ck::half_t>::value, typename std::enable_if<std::is_same<T, bhalf_t>::value, bool>::type
bool>::type
check_err(const std::vector<T>& out, check_err(const std::vector<T>& out,
const std::vector<T>& ref, const std::vector<T>& ref,
const std::string& msg, const std::string& msg = "Error: Incorrect results!",
double rtol = 1e-5, double rtol = 1e-3,
double atol = 1e-8) double atol = 1e-3)
{ {
if(out.size() != ref.size()) if(out.size() != ref.size())
{ {
...@@ -79,11 +80,12 @@ check_err(const std::vector<T>& out, ...@@ -79,11 +80,12 @@ check_err(const std::vector<T>& out,
bool res{true}; bool res{true};
int err_count = 0; int err_count = 0;
double err = 0; double err = 0;
double max_err = ck::type_convert<float>(ck::NumericLimits<T>::Min()); // TODO: This is a hack. We should have proper specialization for bhalf_t data type.
double max_err = std::numeric_limits<float>::min();
for(std::size_t i = 0; i < ref.size(); ++i) for(std::size_t i = 0; i < ref.size(); ++i)
{ {
float o = ck::type_convert<float>(out[i]); double o = type_convert<float>(out[i]);
float r = ck::type_convert<float>(ref[i]); double r = type_convert<float>(ref[i]);
err = std::abs(o - r); err = std::abs(o - r);
if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r)) if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
{ {
...@@ -105,11 +107,14 @@ check_err(const std::vector<T>& out, ...@@ -105,11 +107,14 @@ check_err(const std::vector<T>& out,
return res; return res;
} }
bool check_err(const std::vector<ck::half_t>& out, template <typename T>
const std::vector<ck::half_t>& ref, typename std::enable_if<std::is_same<T, half_t>::value || std::is_same<T, half_float::half>::value,
const std::string& msg, bool>::type
ck::half_t rtol = static_cast<ck::half_t>(1e-3f), check_err(const std::vector<T>& out,
ck::half_t atol = static_cast<ck::half_t>(1e-3f)) const std::vector<T>& ref,
const std::string& msg = "Error: Incorrect results!",
double rtol = 1e-3,
double atol = 1e-3)
{ {
if(out.size() != ref.size()) if(out.size() != ref.size())
{ {
...@@ -122,20 +127,20 @@ bool check_err(const std::vector<ck::half_t>& out, ...@@ -122,20 +127,20 @@ bool check_err(const std::vector<ck::half_t>& out,
bool res{true}; bool res{true};
int err_count = 0; int err_count = 0;
double err = 0; double err = 0;
double max_err = std::numeric_limits<ck::half_t>::min(); double max_err = std::numeric_limits<T>::min();
for(std::size_t i = 0; i < ref.size(); ++i) for(std::size_t i = 0; i < ref.size(); ++i)
{ {
double out_ = double(out[i]); double o = type_convert<float>(out[i]);
double ref_ = double(ref[i]); double r = type_convert<float>(ref[i]);
err = std::abs(out_ - ref_); err = std::abs(o - r);
if(err > atol + rtol * std::abs(ref_) || !std::isfinite(out_) || !std::isfinite(ref_)) if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
{ {
max_err = err > max_err ? err : max_err; max_err = err > max_err ? err : max_err;
err_count++; err_count++;
if(err_count < 5) if(err_count < 5)
{ {
std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref[" std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref["
<< i << "]: " << out_ << "!=" << ref_ << std::endl << i << "]: " << o << " != " << r << std::endl
<< msg << std::endl; << msg << std::endl;
} }
res = false; res = false;
...@@ -149,11 +154,10 @@ bool check_err(const std::vector<ck::half_t>& out, ...@@ -149,11 +154,10 @@ bool check_err(const std::vector<ck::half_t>& out,
} }
template <typename T> template <typename T>
typename std::enable_if<std::is_integral<T>::value && !std::is_same<T, ck::bhalf_t>::value, typename std::enable_if<std::is_integral<T>::value && !std::is_same<T, bhalf_t>::value, bool>::type
bool>::type
check_err(const std::vector<T>& out, check_err(const std::vector<T>& out,
const std::vector<T>& ref, const std::vector<T>& ref,
const std::string& msg, const std::string& msg = "Error: Incorrect results!",
double = 0, double = 0,
double = 0) double = 0)
{ {
...@@ -178,7 +182,8 @@ check_err(const std::vector<T>& out, ...@@ -178,7 +182,8 @@ check_err(const std::vector<T>& out,
return true; return true;
} }
} // namespace test } // namespace utils
} // namespace ck
template <typename T> template <typename T>
std::ostream& operator<<(std::ostream& os, const std::vector<T>& v) std::ostream& operator<<(std::ostream& os, const std::vector<T>& v)
......
#ifndef CONV_UTILS_HPP #ifndef CONV_FWD_UTIL_HPP
#define CONV_UTILS_HPP #define CONV_FWD_UTIL_HPP
#include <algorithm>
#include <cstdlib> #include <cstdlib>
#include <functional> #include <functional>
#include <iterator> #include <iterator>
#include <numeric> #include <numeric>
#include <sstream> #include <sstream>
#include <random>
#include <tuple>
#include <type_traits> #include <type_traits>
#include <vector> #include <vector>
#include "check_err.hpp"
#include "config.hpp" #include "config.hpp"
#include "device.hpp"
#include "device_conv_fwd.hpp"
#include "device_tensor.hpp"
#include "element_wise_operation.hpp"
#include "host_tensor.hpp" #include "host_tensor.hpp"
#include "reference_conv_fwd.hpp"
#include "tensor_layout.hpp" #include "tensor_layout.hpp"
namespace ck { namespace ck {
namespace conv_util { namespace utils {
namespace conv {
using DeviceConvFwdNoOpPtr =
ck::tensor_operation::device::DeviceConvFwdPtr<ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough>;
/** /**
* @brief Calculate number of FLOPs for Convolution * @brief Calculate number of FLOPs for Convolution
...@@ -28,7 +43,7 @@ namespace conv_util { ...@@ -28,7 +43,7 @@ namespace conv_util {
* *
* @return The number of flops. * @return The number of flops.
*/ */
std::size_t GetFlops(ck::index_t N, std::size_t get_flops(ck::index_t N,
ck::index_t C, ck::index_t C,
ck::index_t K, ck::index_t K,
const std::vector<ck::index_t>& filter_spatial_lengths, const std::vector<ck::index_t>& filter_spatial_lengths,
...@@ -66,7 +81,7 @@ std::size_t GetFlops(ck::index_t N, ...@@ -66,7 +81,7 @@ std::size_t GetFlops(ck::index_t N,
template <typename InDataType = float, template <typename InDataType = float,
typename WeiDataType = InDataType, typename WeiDataType = InDataType,
typename OutDataType = InDataType> typename OutDataType = InDataType>
std::size_t GetBtype(ck::index_t N, std::size_t get_btype(ck::index_t N,
ck::index_t C, ck::index_t C,
ck::index_t K, ck::index_t K,
const std::vector<ck::index_t>& input_spatial_lengths, const std::vector<ck::index_t>& input_spatial_lengths,
...@@ -108,27 +123,38 @@ struct ConvParams ...@@ -108,27 +123,38 @@ struct ConvParams
input_right_pads(2, 1) input_right_pads(2, 1)
{ {
} }
ConvParams(ck::index_t n_dim_spatial,
ck::index_t n, ConvParams(ck::index_t n_dim,
ck::index_t k, ck::index_t n_batch,
ck::index_t c, ck::index_t n_out_channels,
std::vector<ck::index_t> filter_lengths, ck::index_t n_in_channels,
std::vector<ck::index_t> input_lengths, const std::vector<ck::index_t>& filters_len,
std::vector<ck::index_t> conv_strides, const std::vector<ck::index_t>& input_len,
std::vector<ck::index_t> conv_dilations, const std::vector<ck::index_t>& strides,
std::vector<ck::index_t> left_pads, const std::vector<ck::index_t>& dilations,
std::vector<ck::index_t> right_pads) const std::vector<ck::index_t>& left_pads,
: num_dim_spatial(n_dim_spatial), const std::vector<ck::index_t>& right_pads)
N(n), : num_dim_spatial(n_dim),
K(k), N(n_batch),
C(c), K(n_out_channels),
filter_spatial_lengths(filter_lengths), C(n_in_channels),
input_spatial_lengths(input_lengths), filter_spatial_lengths(filters_len),
conv_filter_strides(conv_strides), input_spatial_lengths(input_len),
conv_filter_dilations(conv_dilations), conv_filter_strides(strides),
conv_filter_dilations(dilations),
input_left_pads(left_pads), input_left_pads(left_pads),
input_right_pads(right_pads) input_right_pads(right_pads)
{ {
if(filter_spatial_lengths.size() != num_dim_spatial ||
input_spatial_lengths.size() != num_dim_spatial ||
conv_filter_strides.size() != num_dim_spatial ||
conv_filter_dilations.size() != num_dim_spatial ||
input_left_pads.size() != num_dim_spatial || input_right_pads.size() != num_dim_spatial)
{
throw(std::runtime_error(
"ConvParams::GetOutputSpatialLengths: "
"parameter size is different from number of declared dimensions!"));
}
} }
ck::index_t num_dim_spatial; ck::index_t num_dim_spatial;
...@@ -147,6 +173,17 @@ struct ConvParams ...@@ -147,6 +173,17 @@ struct ConvParams
std::vector<ck::index_t> GetOutputSpatialLengths() const std::vector<ck::index_t> GetOutputSpatialLengths() const
{ {
if(filter_spatial_lengths.size() != num_dim_spatial ||
input_spatial_lengths.size() != num_dim_spatial ||
conv_filter_strides.size() != num_dim_spatial ||
conv_filter_dilations.size() != num_dim_spatial ||
input_left_pads.size() != num_dim_spatial || input_right_pads.size() != num_dim_spatial)
{
throw(std::runtime_error(
"ConvParams::GetOutputSpatialLengths: "
"parameter size is different from number of declared dimensions!"));
}
std::vector<ck::index_t> out_spatial_len(num_dim_spatial, 0); std::vector<ck::index_t> out_spatial_len(num_dim_spatial, 0);
for(ck::index_t i = 0; i < num_dim_spatial; ++i) for(ck::index_t i = 0; i < num_dim_spatial; ++i)
{ {
...@@ -174,7 +211,7 @@ struct ConvParams ...@@ -174,7 +211,7 @@ struct ConvParams
* @return The host tensor descriptor object. * @return The host tensor descriptor object.
*/ */
template <typename TensorLayout> template <typename TensorLayout>
HostTensorDescriptor GetHostTensorDescriptor(const std::vector<std::size_t>& dims, HostTensorDescriptor get_host_tensor_descriptor(const std::vector<std::size_t>& dims,
const TensorLayout& layout) const TensorLayout& layout)
{ {
std::size_t C = dims[1]; std::size_t C = dims[1];
...@@ -228,7 +265,7 @@ HostTensorDescriptor GetHostTensorDescriptor(const std::vector<std::size_t>& dim ...@@ -228,7 +265,7 @@ HostTensorDescriptor GetHostTensorDescriptor(const std::vector<std::size_t>& dim
return HostTensorDescriptor( return HostTensorDescriptor(
dims, dims,
std::vector<std::size_t>{ std::vector<std::size_t>{
C * dims[2] * dims[3] * dims[4], 1, dims[3] * dims[4] * C, dims[4] * C, C}); C * dims[2] * dims[3] * dims[4], 1, C * dims[3] * dims[4], C * dims[4], C});
} }
std::stringstream err_msg; std::stringstream err_msg;
...@@ -236,7 +273,282 @@ HostTensorDescriptor GetHostTensorDescriptor(const std::vector<std::size_t>& dim ...@@ -236,7 +273,282 @@ HostTensorDescriptor GetHostTensorDescriptor(const std::vector<std::size_t>& dim
throw std::runtime_error(err_msg.str()); throw std::runtime_error(err_msg.str());
} }
} // namespace conv_util template <typename InDataType = float,
typename WeiDataType = float,
typename OutDataType = float,
typename InLayout = ck::tensor_layout::convolution::NHWC,
typename WeiLayout = ck::tensor_layout::convolution::KYXC,
typename OutLayout = ck::tensor_layout::convolution::NHWK>
auto get_host_tensors(const ConvParams& params, bool init = true)
{
std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N),
static_cast<std::size_t>(params.C)};
input_dims.insert(std::end(input_dims),
std::begin(params.input_spatial_lengths),
std::end(params.input_spatial_lengths));
std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K),
static_cast<std::size_t>(params.C)};
filter_dims.insert(std::end(filter_dims),
std::begin(params.filter_spatial_lengths),
std::end(params.filter_spatial_lengths));
const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N),
static_cast<std::size_t>(params.K)};
output_dims.insert(std::end(output_dims),
std::begin(output_spatial_lengths),
std::end(output_spatial_lengths));
Tensor<InDataType> input(ck::utils::conv::get_host_tensor_descriptor(input_dims, InLayout{}));
Tensor<WeiDataType> weights(
ck::utils::conv::get_host_tensor_descriptor(filter_dims, WeiLayout{}));
Tensor<OutDataType> host_output(
ck::utils::conv::get_host_tensor_descriptor(output_dims, OutLayout{}));
Tensor<OutDataType> device_output(
ck::utils::conv::get_host_tensor_descriptor(output_dims, OutLayout{}));
if(init)
{
std::mt19937 gen(11939);
if constexpr(std::is_same<InDataType, uint8_t>::value)
{
std::uniform_int_distribution<> dis(-5, 5);
std::generate(
input.begin(), input.end(), [&dis, &gen]() { return InDataType(dis(gen)); });
std::generate(
weights.begin(), weights.end(), [&dis, &gen]() { return WeiDataType(dis(gen)); });
}
else
{
std::uniform_real_distribution<> dis(0.f, 1.f);
std::generate(
input.begin(), input.end(), [&dis, &gen]() { return InDataType(dis(gen)); });
std::generate(
weights.begin(), weights.end(), [&dis, &gen]() { return WeiDataType(dis(gen)); });
}
std::fill(host_output.begin(), host_output.end(), OutDataType(0.f));
std::fill(device_output.begin(), device_output.end(), OutDataType(0.f));
}
return std::make_tuple(input, weights, host_output, device_output);
}
/**
 * @brief Build a HostTensorDescriptor for a convolution *output* tensor using the
 *        channel-last layout matching the number of spatial dimensions
 *        (1D: NWK, 2D: NHWK, 3D: NDHWK).
 *
 * @param dims            Output dimensions ordered as N, K, <spatial...>.
 * @param num_dim_spatial Number of spatial dimensions (1, 2 or 3).
 * @return The descriptor for the chosen layout.
 * @throws std::runtime_error for an unsupported spatial dimensionality.
 */
HostTensorDescriptor get_output_host_tensor_descriptor(const std::vector<std::size_t>& dims,
                                                       int num_dim_spatial = 2)
{
    namespace tl = ck::tensor_layout::convolution;

    if(num_dim_spatial == 3)
    {
        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NDHWK{});
    }
    if(num_dim_spatial == 2)
    {
        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NHWK{});
    }
    if(num_dim_spatial == 1)
    {
        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NWK{});
    }
    throw std::runtime_error("Unsupported number of spatial dimensions provided!");
}
/**
 * @brief Build a HostTensorDescriptor for a convolution *weights* tensor using the
 *        channel-last layout matching the number of spatial dimensions
 *        (1D: KXC, 2D: KYXC, 3D: KZYXC).
 *
 * @param dims            Filter dimensions ordered as K, C, <spatial...>.
 * @param num_dim_spatial Number of spatial dimensions (1, 2 or 3).
 * @return The descriptor for the chosen layout.
 * @throws std::runtime_error for an unsupported spatial dimensionality.
 */
HostTensorDescriptor get_filters_host_tensor_descriptor(const std::vector<std::size_t>& dims,
                                                        int num_dim_spatial = 2)
{
    namespace tl = ck::tensor_layout::convolution;

    if(num_dim_spatial == 3)
    {
        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::KZYXC{});
    }
    if(num_dim_spatial == 2)
    {
        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::KYXC{});
    }
    if(num_dim_spatial == 1)
    {
        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::KXC{});
    }
    throw std::runtime_error("Unsupported number of spatial dimensions provided!");
}
/**
 * @brief Build a HostTensorDescriptor for a convolution *input* tensor using the
 *        channel-last layout matching the number of spatial dimensions
 *        (1D: NWC, 2D: NHWC, 3D: NDHWC).
 *
 * @param dims            Input dimensions ordered as N, C, <spatial...>.
 * @param num_dim_spatial Number of spatial dimensions (1, 2 or 3).
 * @return The descriptor for the chosen layout.
 * @throws std::runtime_error for an unsupported spatial dimensionality.
 */
HostTensorDescriptor get_input_host_tensor_descriptor(const std::vector<std::size_t>& dims,
                                                      int num_dim_spatial = 2)
{
    namespace tl = ck::tensor_layout::convolution;

    if(num_dim_spatial == 3)
    {
        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NDHWC{});
    }
    if(num_dim_spatial == 2)
    {
        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NHWC{});
    }
    if(num_dim_spatial == 1)
    {
        return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NWC{});
    }
    throw std::runtime_error("Unsupported number of spatial dimensions provided!");
}
/**
 * @brief Run the host-side reference convolution forward pass.
 *
 * Uses pass-through element-wise operations on all three tensors; @p output is
 * overwritten with the reference result.
 *
 * @tparam NDim   Number of spatial dimensions.
 * @param params  Convolution problem description (strides, dilations, pads).
 * @param input   Input activation tensor.
 * @param weights Filter tensor.
 * @param output  Destination tensor for the reference result.
 */
template <ck::index_t NDim,
          typename InDataType  = float,
          typename WeiDataType = float,
          typename OutDataType = float>
void run_reference_convolution_forward(const ConvParams& params,
                                       const Tensor<InDataType>& input,
                                       const Tensor<WeiDataType>& weights,
                                       Tensor<OutDataType>& output)
{
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;

    auto host_conv = ck::tensor_operation::host::ReferenceConvFwd<InDataType,
                                                                  WeiDataType,
                                                                  OutDataType,
                                                                  PassThrough,
                                                                  PassThrough,
                                                                  PassThrough,
                                                                  NDim>{};

    auto host_invoker = host_conv.MakeInvoker();
    auto host_arg     = host_conv.MakeArgument(input,
                                           weights,
                                           output,
                                           params.conv_filter_strides,
                                           params.conv_filter_dilations,
                                           params.input_left_pads,
                                           params.input_right_pads,
                                           PassThrough{},
                                           PassThrough{},
                                           PassThrough{});
    host_invoker.Run(host_arg);
}
template <ck::index_t NDim,
typename InDataType = float,
typename WeiDataType = float,
typename OutDataType = float,
template <ck::index_t, typename, typename, typename>
class DeviceConvNDFwdInstance>
void run_convolution_forward(const ConvParams& params,
const Tensor<InDataType>& input,
const Tensor<WeiDataType>& weights,
Tensor<OutDataType>& output)
{
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace());
DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace());
DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpace());
in_device_buf.ToDevice(input.mData.data());
wei_device_buf.ToDevice(weights.mData.data());
const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
auto conv = DeviceConvNDFwdInstance<NDim, InDataType, WeiDataType, OutDataType>();
auto invoker = conv.MakeInvoker();
auto argument = conv.MakeArgument(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
params.N,
params.K,
params.C,
params.input_spatial_lengths,
params.filter_spatial_lengths,
output_spatial_lengths,
params.conv_filter_strides,
params.conv_filter_dilations,
params.input_left_pads,
params.input_right_pads,
PassThrough{},
PassThrough{},
PassThrough{});
if(!conv.IsSupportedArgument(argument))
{
throw std::runtime_error(
"Error! device_conv with the specified compilation parameters does "
"not support this Conv problem");
}
invoker.Run(argument);
out_device_buf.FromDevice(output.mData.data());
}
template <ck::index_t NDim,
typename InDataType = float,
typename WeiDataType = float,
typename OutDataType = float>
bool run_convolution_forward_instances(const ConvParams& params,
const std::vector<DeviceConvFwdNoOpPtr>& conv_ptrs,
const Tensor<InDataType>& input,
const Tensor<WeiDataType>& weights,
Tensor<OutDataType>& output,
const Tensor<OutDataType>& host_output)
{
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace());
DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace());
DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpace());
in_device_buf.ToDevice(input.mData.data());
wei_device_buf.ToDevice(weights.mData.data());
const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
bool res{true};
for(auto& conv_ptr : conv_ptrs)
{
auto invoker = conv_ptr->MakeInvokerPointer();
auto argument = conv_ptr->MakeArgumentPointer(
static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
params.N,
params.K,
params.C,
params.input_spatial_lengths,
params.filter_spatial_lengths,
output_spatial_lengths,
params.conv_filter_strides,
params.conv_filter_dilations,
params.input_left_pads,
params.input_right_pads,
PassThrough{},
PassThrough{},
PassThrough{});
if(conv_ptr->IsSupportedArgument(argument.get()))
{
float atol{1e-5f};
float rtol{1e-4f};
if constexpr(std::is_same_v<InDataType, ck::half_t>)
{
atol = 1e-4f;
rtol = 2.5e-3f;
}
invoker->Run(argument.get());
out_device_buf.FromDevice(output.mData.data());
res = res &&
ck::utils::check_err(
output.mData, host_output.mData, "Error: incorrect results!", atol, rtol);
hipGetErrorString(
hipMemset(out_device_buf.GetDeviceBuffer(), 0, out_device_buf.mMemSize));
}
}
return res;
}
} // namespace conv
} // namespace utils
} // namespace ck } // namespace ck
#endif #endif
...@@ -65,21 +65,10 @@ void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream ...@@ -65,21 +65,10 @@ void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream
} }
#if 1 #if 1
// FIXME: remove
float bf16_to_f32_(ck::bhalf_t src_val)
{
union
{
uint32_t int32;
float fp32;
} u = {uint32_t(src_val) << 16};
return u.fp32;
}
// FIXME: remove // FIXME: remove
void bf16_to_f32_(const Tensor<ck::bhalf_t>& src, Tensor<float>& dst) void bf16_to_f32_(const Tensor<ck::bhalf_t>& src, Tensor<float>& dst)
{ {
for(int i = 0; i < src.mData.size(); ++i) for(int i = 0; i < src.mData.size(); ++i)
dst.mData[i] = bf16_to_f32_(src.mData[i]); dst.mData[i] = ck::type_convert<float>(src.mData[i]);
} }
#endif #endif
...@@ -4,6 +4,8 @@ ...@@ -4,6 +4,8 @@
#include <cstdlib> #include <cstdlib>
#include <stdlib.h> #include <stdlib.h>
#include <half.hpp> #include <half.hpp>
#include "check_err.hpp"
#include "config.hpp" #include "config.hpp"
#include "debug.hpp" #include "debug.hpp"
#include "print.hpp" #include "print.hpp"
...@@ -39,7 +41,7 @@ void host_direct_convolution_add_nchwc(const Tensor<TIn>& in, ...@@ -39,7 +41,7 @@ void host_direct_convolution_add_nchwc(const Tensor<TIn>& in,
const ConvDilations& conv_dilations, const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads, const InLeftPads& in_left_pads,
const InRightPads&, const InRightPads&,
const ck::ActivTypeEnum_t activ_type) const ck::ActivTypeEnum activ_type)
{ {
using namespace ck; using namespace ck;
...@@ -117,7 +119,7 @@ int main(int argc, char* argv[]) ...@@ -117,7 +119,7 @@ int main(int argc, char* argv[])
exit(1); exit(1);
} }
constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu; constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu;
const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1])); const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
const bool do_verification = std::stoi(argv[2]); const bool do_verification = std::stoi(argv[2]);
...@@ -167,7 +169,7 @@ int main(int argc, char* argv[]) ...@@ -167,7 +169,7 @@ int main(int argc, char* argv[])
const bool do_log = std::stoi(argv[4]); const bool do_log = std::stoi(argv[4]);
const int nrepeat = std::stoi(argv[5]); const int nrepeat = std::stoi(argv[5]);
constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu; constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu;
#if 0 #if 0
constexpr auto N = Number<1>{}; constexpr auto N = Number<1>{};
...@@ -401,7 +403,7 @@ int main(int argc, char* argv[]) ...@@ -401,7 +403,7 @@ int main(int argc, char* argv[])
make_tuple(in_right_pad_h, in_right_pad_w), make_tuple(in_right_pad_h, in_right_pad_w),
activ_type); activ_type);
check_error(add_host, add_device); ck::utils::check_err(add_device.mData, add_host.mData);
if(do_log) if(do_log)
{ {
......
...@@ -4,6 +4,8 @@ ...@@ -4,6 +4,8 @@
#include <cstdlib> #include <cstdlib>
#include <stdlib.h> #include <stdlib.h>
#include <half.hpp> #include <half.hpp>
#include "check_err.hpp"
#include "config.hpp" #include "config.hpp"
#include "debug.hpp" #include "debug.hpp"
#include "print.hpp" #include "print.hpp"
...@@ -473,7 +475,7 @@ int main(int argc, char* argv[]) ...@@ -473,7 +475,7 @@ int main(int argc, char* argv[])
make_tuple(in_right_pad_h, in_right_pad_w), make_tuple(in_right_pad_h, in_right_pad_w),
layout); layout);
check_error(in_host, in_device); ck::utils::check_err(in_device.mData, in_host.mData);
if(do_log) if(do_log)
{ {
......
...@@ -4,6 +4,8 @@ ...@@ -4,6 +4,8 @@
#include <cstdlib> #include <cstdlib>
#include <stdlib.h> #include <stdlib.h>
#include <half.hpp> #include <half.hpp>
#include "check_err.hpp"
#include "config.hpp" #include "config.hpp"
#include "debug.hpp" #include "debug.hpp"
#include "print.hpp" #include "print.hpp"
...@@ -534,7 +536,7 @@ int main(int argc, char* argv[]) ...@@ -534,7 +536,7 @@ int main(int argc, char* argv[])
make_tuple(in_right_pad_h, in_right_pad_w), make_tuple(in_right_pad_h, in_right_pad_w),
layout); layout);
check_error(out_host, out_device); ck::utils::check_err(out_device.mData, out_host.mData);
if(do_log) if(do_log)
{ {
......
...@@ -4,6 +4,8 @@ ...@@ -4,6 +4,8 @@
#include <cstdlib> #include <cstdlib>
#include <stdlib.h> #include <stdlib.h>
#include <half.hpp> #include <half.hpp>
#include "check_err.hpp"
#include "config.hpp" #include "config.hpp"
#include "debug.hpp" #include "debug.hpp"
#include "print.hpp" #include "print.hpp"
...@@ -37,7 +39,7 @@ void host_direct_convolution_nchwc(const Tensor<TIn>& in, ...@@ -37,7 +39,7 @@ void host_direct_convolution_nchwc(const Tensor<TIn>& in,
const ConvDilations& conv_dilations, const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads, const InLeftPads& in_left_pads,
const InRightPads&, const InRightPads&,
const ck::ActivTypeEnum_t activ_type) const ck::ActivTypeEnum activ_type)
{ {
using namespace ck; using namespace ck;
...@@ -102,7 +104,7 @@ int main(int argc, char* argv[]) ...@@ -102,7 +104,7 @@ int main(int argc, char* argv[])
exit(1); exit(1);
} }
constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu; constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu;
const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1])); const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
const bool do_verification = std::stoi(argv[2]); const bool do_verification = std::stoi(argv[2]);
...@@ -149,8 +151,8 @@ int main(int argc, char* argv[]) ...@@ -149,8 +151,8 @@ int main(int argc, char* argv[])
const bool do_log = std::stoi(argv[4]); const bool do_log = std::stoi(argv[4]);
const int nrepeat = std::stoi(argv[5]); const int nrepeat = std::stoi(argv[5]);
// constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::Sigmoid; // constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::Sigmoid;
constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu; constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu;
#if 0 #if 0
constexpr auto N = Number<1>{}; constexpr auto N = Number<1>{};
...@@ -377,7 +379,7 @@ int main(int argc, char* argv[]) ...@@ -377,7 +379,7 @@ int main(int argc, char* argv[])
make_tuple(in_right_pad_h, in_right_pad_w), make_tuple(in_right_pad_h, in_right_pad_w),
activ_type); activ_type);
check_error(out_host, out_device); ck::utils::check_err(out_device.mData, out_host.mData);
if(do_log) if(do_log)
{ {
......
...@@ -4,6 +4,8 @@ ...@@ -4,6 +4,8 @@
#include <cstdlib> #include <cstdlib>
#include <stdlib.h> #include <stdlib.h>
#include <half.hpp> #include <half.hpp>
#include "check_err.hpp"
#include "config.hpp" #include "config.hpp"
#include "debug.hpp" #include "debug.hpp"
#include "print.hpp" #include "print.hpp"
...@@ -38,7 +40,7 @@ void host_direct_convolution_maxpool_nchwc(const Tensor<TIn>& in, ...@@ -38,7 +40,7 @@ void host_direct_convolution_maxpool_nchwc(const Tensor<TIn>& in,
const ConvDilations& conv_dilations, const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads, const InLeftPads& in_left_pads,
const InRightPads&, const InRightPads&,
const ck::ActivTypeEnum_t activ_type) const ck::ActivTypeEnum activ_type)
{ {
using namespace ck; using namespace ck;
...@@ -126,7 +128,7 @@ int main(int argc, char* argv[]) ...@@ -126,7 +128,7 @@ int main(int argc, char* argv[])
exit(1); exit(1);
} }
constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu; constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu;
const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1])); const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
const bool do_verification = std::stoi(argv[2]); const bool do_verification = std::stoi(argv[2]);
...@@ -176,7 +178,7 @@ int main(int argc, char* argv[]) ...@@ -176,7 +178,7 @@ int main(int argc, char* argv[])
const bool do_log = std::stoi(argv[4]); const bool do_log = std::stoi(argv[4]);
const int nrepeat = std::stoi(argv[5]); const int nrepeat = std::stoi(argv[5]);
constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu; constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu;
#if 1 #if 1
constexpr auto N = Number<1>{}; constexpr auto N = Number<1>{};
...@@ -397,8 +399,8 @@ int main(int argc, char* argv[]) ...@@ -397,8 +399,8 @@ int main(int argc, char* argv[])
make_tuple(in_right_pad_h, in_right_pad_w), make_tuple(in_right_pad_h, in_right_pad_w),
activ_type); activ_type);
check_error(out_host, out_device); ck::utils::check_err(out_device.mData, out_host.mData);
check_error(max_host, max_device); ck::utils::check_err(max_device.mData, max_host.mData);
if(do_log) if(do_log)
{ {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment