Unverified commit cd167e49, authored by Chao Liu, committed by GitHub

Compile for gfx908 and gfx90a (#130)

* adding compilation for multiple targets

* fix build

* clean

* update Jenkinsfile

* update readme

* update Jenkins

* use ck::half_t instead of ushort for bf16

* rename enum classes

* clean

* rename

* clean
parent ecf337ba
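
Most of the hunks below are the enum-class rename from the commit message: the C-style "_t" suffix is dropped from InMemoryDataOperationEnum_t, ReduceTensorOp_t, NanPropagation_t, ReduceTensorIndices_t, ActivTypeEnum_t, GemmSpecialization_t, and ConvolutionForwardSpecialization_t. The gfx908/gfx90a multi-target build itself lives in the CMake/Jenkinsfile changes, which are collapsed out of this view. A minimal before/after sketch of the rename (the caller code is hypothetical; enumerator names such as Set are unchanged by the commit):

    // before: scoped enums carried a C-style "_t" suffix
    // constexpr auto op = ck::InMemoryDataOperationEnum_t::Set;

    // after: same enumerators, suffix-free type name
    constexpr auto op = ck::InMemoryDataOperationEnum::Set;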
@@ -10,7 +10,7 @@ template <ck::index_t BlockSize,
           typename FloatAB,
           typename FloatAcc,
           typename FloatC,
-          ck::InMemoryDataOperationEnum_t CGlobalMemoryDataOperation,
+          ck::InMemoryDataOperationEnum CGlobalMemoryDataOperation,
           typename AKMGridDesc,
           typename BKNGridDesc,
           typename CMNGridDesc,
...
@@ -10,7 +10,7 @@ template <ck::index_t BlockSize,
           typename FloatAB,
           typename FloatAcc,
           typename FloatC,
-          ck::InMemoryDataOperationEnum_t CGlobalMemoryDataOperation,
+          ck::InMemoryDataOperationEnum CGlobalMemoryDataOperation,
           typename AK0MK1GridDesc,
           typename BK0NK1GridDesc,
           typename CMNGridDesc,
...
@@ -11,7 +11,7 @@ template <ck::index_t BlockSize,
           typename FloatAB,
           typename FloatAcc,
           typename FloatC,
-          ck::InMemoryDataOperationEnum_t CGlobalMemoryDataOperation,
+          ck::InMemoryDataOperationEnum CGlobalMemoryDataOperation,
           typename AGridDesc_K0_M_K1,
           typename BGridDesc_K0_N_K,
           typename CMNGridDesc,
...
@@ -10,7 +10,7 @@ template <ck::index_t BlockSize,
           typename FloatAB,
           typename FloatAcc,
           typename FloatC,
-          ck::InMemoryDataOperationEnum_t CGlobalMemoryDataOperation,
+          ck::InMemoryDataOperationEnum CGlobalMemoryDataOperation,
           typename ABK0MK1GridDesc,
           typename BBK0NK1GridDesc,
           typename CMNGridDesc,
...
@@ -47,7 +47,7 @@ using reduce_configuration_2_instances_blockwise = std::tuple<
     >;
 #endif

-template <typename AccDataType, ReduceTensorOp_t ReduceOpId>
+template <typename AccDataType, ReduceTensorOp ReduceOpId>
 using deviceReduceBlockWisePtrType = DeviceReducePtr<
     typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation,
     typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::AccElementwiseOperation>;
@@ -57,9 +57,9 @@ template <typename InDataType,
           typename OutDataType,
           int Rank,
           int NumReduceDim,
-          ReduceTensorOp_t ReduceOpId,
-          NanPropagation_t NanOpt,
-          ReduceTensorIndices_t IndicesOpt>
+          ReduceTensorOp ReduceOpId,
+          NanPropagation NanOpt,
+          ReduceTensorIndices IndicesOpt>
 void add_device_reduce_instance_blockwise(
     std::vector<deviceReduceBlockWisePtrType<AccDataType, ReduceOpId>>& device_op_instances)
 {
@@ -71,11 +71,11 @@ void add_device_reduce_instance_blockwise(
         AccElementwiseOperation;

     constexpr bool Indexable =
-        (ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX ||
-         ReduceOpId == ReduceTensorOp_t::AMAX);
-    constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES);
-    constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true;
+        (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
+         ReduceOpId == ReduceTensorOp::AMAX);
+    constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES);
+    constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true;

     static_for<0, std::tuple_size<reduce_configuration_1_instances>::value, 1>{}([&](auto i) {
         using cfg1 =
@@ -123,15 +123,15 @@ void add_device_reduce_instance_blockwise(
                                 IndicesOpt>( \
         std::vector<deviceReduceBlockWisePtrType<compT, ReduceOpId>> & device_op_instances)

 #define ADD_BLOCKWISE_INST_BY_ID( \
     inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
     ADD_BLOCKWISE_INST_BY_TYPE(inT, \
                                compT, \
                                outT, \
-                               static_cast<ReduceTensorOp_t>(ReduceOpId), \
-                               static_cast<NanPropagation_t>(NanOpt), \
-                               static_cast<ReduceTensorIndices_t>(IndicesOpt), \
+                               static_cast<ReduceTensorOp>(ReduceOpId), \
+                               static_cast<NanPropagation>(NanOpt), \
+                               static_cast<ReduceTensorIndices>(IndicesOpt), \
                                Rank, \
                                NumReduceDim)

 #define ADD_BLOCKWISE_INST_REF_BY_TYPE( \
@@ -150,15 +150,15 @@ void add_device_reduce_instance_blockwise(
                                     AccElementwiseOperation>> & \
         device_op_instances)

 #define ADD_BLOCKWISE_INST_REF_BY_ID( \
     inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
     ADD_BLOCKWISE_INST_REF_BY_TYPE(inT, \
                                    compT, \
                                    outT, \
-                                   static_cast<ReduceTensorOp_t>(ReduceOpId), \
-                                   static_cast<NanPropagation_t>(NanOpt), \
-                                   static_cast<ReduceTensorIndices_t>(IndicesOpt), \
+                                   static_cast<ReduceTensorOp>(ReduceOpId), \
+                                   static_cast<NanPropagation>(NanOpt), \
+                                   static_cast<ReduceTensorIndices>(IndicesOpt), \
                                    Rank, \
                                    NumReduceDim)

 } // namespace device_reduce_instance
...
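
As context for the Indexable/NeedIndices/PropagateNan flags computed above, here is a self-contained sketch with the renamed enum classes re-declared locally for illustration (the enumerator sets are assumed to mirror the cuDNN-style originals):

    enum class ReduceTensorOp { ADD, MUL, MIN, MAX, AMAX, AVG, NORM1, NORM2 };
    enum class ReduceTensorIndices { NO_INDICES, FLATTENED_INDICES };
    enum class NanPropagation { NOT_PROPAGATE_NAN, PROPAGATE_NAN };

    template <ReduceTensorOp ReduceOpId, ReduceTensorIndices IndicesOpt, NanPropagation NanOpt>
    struct ReduceTraits
    {
        // Only MIN/MAX/AMAX have a meaningful argmin/argmax answer, so only
        // they can return indices alongside the reduced values.
        static constexpr bool Indexable =
            (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
             ReduceOpId == ReduceTensorOp::AMAX);
        static constexpr bool NeedIndices =
            Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES);
        static constexpr bool PropagateNan = (NanOpt != NanPropagation::NOT_PROPAGATE_NAN);
    };

    static_assert(ReduceTraits<ReduceTensorOp::MAX,
                               ReduceTensorIndices::FLATTENED_INDICES,
                               NanPropagation::PROPAGATE_NAN>::NeedIndices,
                  "MAX with flattened indices should request indices");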
@@ -34,7 +34,7 @@ using reduce_configuration_2_instances_blockwise_second_call = std::tuple<
     >;
 #endif

-template <typename AccDataType, ReduceTensorOp_t ReduceOpId>
+template <typename AccDataType, ReduceTensorOp ReduceOpId>
 using deviceReduceBlockWiseSecondCallPtrType = DeviceReducePtr<
     typename reduce_unary_operator<AccDataType, ReduceOpId, false, true>::InElementwiseOperation,
     typename reduce_unary_operator<AccDataType, ReduceOpId, false, true>::AccElementwiseOperation>;
@@ -44,9 +44,9 @@ template <typename InDataType,
           typename OutDataType,
           int Rank,
           int NumReduceDim,
-          ReduceTensorOp_t ReduceOpId,
-          NanPropagation_t NanOpt,
-          ReduceTensorIndices_t IndicesOpt>
+          ReduceTensorOp ReduceOpId,
+          NanPropagation NanOpt,
+          ReduceTensorIndices IndicesOpt>
 void add_device_reduce_instance_blockwise_second_call(
     std::vector<deviceReduceBlockWiseSecondCallPtrType<AccDataType, ReduceOpId>>&
         device_op_instances)
@@ -60,11 +60,11 @@ void add_device_reduce_instance_blockwise_second_call(
         AccElementwiseOperation;

     constexpr bool Indexable =
-        (ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX ||
-         ReduceOpId == ReduceTensorOp_t::AMAX);
-    constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES);
-    constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true;
+        (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
+         ReduceOpId == ReduceTensorOp::AMAX);
+    constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES);
+    constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true;

     static_assert(std::is_same<InDataType, AccDataType>::value,
                   "InDataType and AccDataType should be the same to use "
@@ -117,15 +117,15 @@ void add_device_reduce_instance_blockwise_second_call(
         std::vector<deviceReduceBlockWiseSecondCallPtrType<compT, ReduceOpId>> & \
         device_op_instances)

 #define ADD_BLOCKWISE_SECOND_CALL_INST_BY_ID( \
     inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
     ADD_BLOCKWISE_SECOND_CALL_INST_BY_TYPE(inT, \
                                            compT, \
                                            outT, \
-                                           static_cast<ReduceTensorOp_t>(ReduceOpId), \
-                                           static_cast<NanPropagation_t>(NanOpt), \
-                                           static_cast<ReduceTensorIndices_t>(IndicesOpt), \
+                                           static_cast<ReduceTensorOp>(ReduceOpId), \
+                                           static_cast<NanPropagation>(NanOpt), \
+                                           static_cast<ReduceTensorIndices>(IndicesOpt), \
                                            Rank, \
                                            NumReduceDim)

 #define ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_TYPE( \
@@ -145,15 +145,15 @@ void add_device_reduce_instance_blockwise_second_call(
         AccElementwiseOperation>> & \
         device_op_instances)

 #define ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID( \
     inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
     ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_TYPE(inT, \
                                                compT, \
                                                outT, \
-                                               static_cast<ReduceTensorOp_t>(ReduceOpId), \
-                                               static_cast<NanPropagation_t>(NanOpt), \
-                                               static_cast<ReduceTensorIndices_t>(IndicesOpt), \
+                                               static_cast<ReduceTensorOp>(ReduceOpId), \
+                                               static_cast<NanPropagation>(NanOpt), \
+                                               static_cast<ReduceTensorIndices>(IndicesOpt), \
                                                Rank, \
                                                NumReduceDim)

 } // namespace device_reduce_instance
...
@@ -47,7 +47,7 @@ using reduce_configuration_2_instances_multiblock_atomic_add = std::tuple<
     >;
 #endif

-template <typename AccDataType, ReduceTensorOp_t ReduceOperation>
+template <typename AccDataType, ReduceTensorOp ReduceOperation>
 using deviceReduceMultiBlockAtomicAddPtrType =
     DeviceReducePtr<typename reduce_unary_operator<AccDataType, ReduceOperation, true, true>::
                         InElementwiseOperation,
@@ -59,9 +59,9 @@ template <typename InDataType,
           typename OutDataType,
           int Rank,
           int NumReduceDim,
-          ReduceTensorOp_t ReduceOpId,
-          NanPropagation_t NanOpt,
-          ReduceTensorIndices_t IndicesOpt>
+          ReduceTensorOp ReduceOpId,
+          NanPropagation NanOpt,
+          ReduceTensorIndices IndicesOpt>
 void add_device_reduce_instance_multiblock_atomic_add(
     std::vector<deviceReduceMultiBlockAtomicAddPtrType<AccDataType, ReduceOpId>>&
         device_op_instances)
@@ -74,18 +74,18 @@ void add_device_reduce_instance_multiblock_atomic_add(
         AccElementwiseOperation;

     constexpr bool Indexable =
-        (ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX ||
-         ReduceOpId == ReduceTensorOp_t::AMAX);
-    constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES);
-    constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true;
+        (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
+         ReduceOpId == ReduceTensorOp::AMAX);
+    constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES);
+    constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true;

-    static_assert(IndicesOpt == ReduceTensorIndices_t::NO_INDICES,
+    static_assert(IndicesOpt == ReduceTensorIndices::NO_INDICES,
                   "AtomicAdd can only be used with reduction operations without indices!");

     constexpr bool op_acceptable =
-        (ReduceOpId == ReduceTensorOp_t::ADD || ReduceOpId == ReduceTensorOp_t::MUL ||
-         ReduceOpId == ReduceTensorOp_t::AVG || ReduceOpId == ReduceTensorOp_t::NORM1);
+        (ReduceOpId == ReduceTensorOp::ADD || ReduceOpId == ReduceTensorOp::MUL ||
+         ReduceOpId == ReduceTensorOp::AVG || ReduceOpId == ReduceTensorOp::NORM1);

     constexpr bool out_type_acceptable =
         (std::is_same<OutDataType, float>::value || std::is_same<OutDataType, double>::value);
@@ -144,15 +144,15 @@ void add_device_reduce_instance_multiblock_atomic_add(
         std::vector<deviceReduceMultiBlockAtomicAddPtrType<compT, ReduceOpId>> & \
         device_op_instances)

 #define ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID( \
     inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
     ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_TYPE(inT, \
                                            compT, \
                                            outT, \
-                                           static_cast<ReduceTensorOp_t>(ReduceOpId), \
-                                           static_cast<NanPropagation_t>(NanOpt), \
-                                           static_cast<ReduceTensorIndices_t>(IndicesOpt), \
+                                           static_cast<ReduceTensorOp>(ReduceOpId), \
+                                           static_cast<NanPropagation>(NanOpt), \
+                                           static_cast<ReduceTensorIndices>(IndicesOpt), \
                                            Rank, \
                                            NumReduceDim)

 #define ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE( \
@@ -171,15 +171,15 @@ void add_device_reduce_instance_multiblock_atomic_add(
         AccElementwiseOperation>> & \
         device_op_instances)

 #define ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID( \
     inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
     ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE(inT, \
                                                compT, \
                                                outT, \
-                                               static_cast<ReduceTensorOp_t>(ReduceOpId), \
-                                               static_cast<NanPropagation_t>(NanOpt), \
-                                               static_cast<ReduceTensorIndices_t>(IndicesOpt), \
+                                               static_cast<ReduceTensorOp>(ReduceOpId), \
+                                               static_cast<NanPropagation>(NanOpt), \
+                                               static_cast<ReduceTensorIndices>(IndicesOpt), \
                                                Rank, \
                                                NumReduceDim)

 } // namespace device_reduce_instance
...
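
The static_asserts in this file gate the atomic-add path; here is a standalone restatement of those compile-time guards, with the renamed enum re-declared locally (this mirrors the checks above, it is not the library header):

    #include <type_traits>

    enum class ReduceTensorOp { ADD, MUL, MIN, MAX, AMAX, AVG, NORM1, NORM2 };

    template <ReduceTensorOp Op, typename OutDataType>
    constexpr bool multiblock_atomic_add_usable()
    {
        // mirrors op_acceptable and out_type_acceptable above: only these four
        // ops, and only float/double outputs (the types with a hardware
        // atomic-add instruction), may use the AtomicAdd finalization
        return (Op == ReduceTensorOp::ADD || Op == ReduceTensorOp::MUL ||
                Op == ReduceTensorOp::AVG || Op == ReduceTensorOp::NORM1) &&
               (std::is_same<OutDataType, float>::value ||
                std::is_same<OutDataType, double>::value);
    }

    static_assert(multiblock_atomic_add_usable<ReduceTensorOp::ADD, float>(), "sanity check");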
@@ -46,7 +46,7 @@ using reduce_configuration_2_instances_multiblock_partial_reduce = std::tuple<
     >;
 #endif

-template <typename AccDataType, ReduceTensorOp_t ReduceOpId>
+template <typename AccDataType, ReduceTensorOp ReduceOpId>
 using deviceReduceMultiBlockPartialReducePtrType = DeviceReducePtr<
     typename reduce_unary_operator<AccDataType, ReduceOpId, true, false>::InElementwiseOperation,
     typename reduce_unary_operator<AccDataType, ReduceOpId, true, false>::AccElementwiseOperation>;
@@ -56,9 +56,9 @@ template <typename InDataType,
           typename OutDataType,
           int Rank,
           int NumReduceDim,
-          ReduceTensorOp_t ReduceOpId,
-          NanPropagation_t NanOpt,
-          ReduceTensorIndices_t IndicesOpt>
+          ReduceTensorOp ReduceOpId,
+          NanPropagation NanOpt,
+          ReduceTensorIndices IndicesOpt>
 void add_device_reduce_instance_multiblock_partial_reduce(
     std::vector<deviceReduceMultiBlockPartialReducePtrType<AccDataType, ReduceOpId>>&
         device_op_instances)
@@ -72,11 +72,11 @@ void add_device_reduce_instance_multiblock_partial_reduce(
         AccElementwiseOperation;

     constexpr bool Indexable =
-        (ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX ||
-         ReduceOpId == ReduceTensorOp_t::AMAX);
-    constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES);
-    constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true;
+        (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
+         ReduceOpId == ReduceTensorOp::AMAX);
+    constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES);
+    constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true;

     static_for<0, std::tuple_size<reduce_configuration_1_instances>::value, 1>{}([&](auto i) {
         using cfg1 =
@@ -126,15 +126,15 @@ void add_device_reduce_instance_multiblock_partial_reduce(
         std::vector<deviceReduceMultiBlockPartialReducePtrType<compT, ReduceOpId>> & \
         device_op_instances)

 #define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID( \
     inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
     ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_TYPE(inT, \
                                                compT, \
                                                outT, \
-                                               static_cast<ReduceTensorOp_t>(ReduceOpId), \
-                                               static_cast<NanPropagation_t>(NanOpt), \
-                                               static_cast<ReduceTensorIndices_t>(IndicesOpt), \
+                                               static_cast<ReduceTensorOp>(ReduceOpId), \
+                                               static_cast<NanPropagation>(NanOpt), \
+                                               static_cast<ReduceTensorIndices>(IndicesOpt), \
                                                Rank, \
                                                NumReduceDim)

 #define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_TYPE( \
@@ -154,15 +154,15 @@ void add_device_reduce_instance_multiblock_partial_reduce(
         AccElementwiseOperation>> & \
         device_op_instances)

 #define ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_ID( \
     inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
     ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_REF_BY_TYPE(inT, \
                                                    compT, \
                                                    outT, \
-                                                   static_cast<ReduceTensorOp_t>(ReduceOpId), \
-                                                   static_cast<NanPropagation_t>(NanOpt), \
-                                                   static_cast<ReduceTensorIndices_t>(IndicesOpt), \
+                                                   static_cast<ReduceTensorOp>(ReduceOpId), \
+                                                   static_cast<NanPropagation>(NanOpt), \
+                                                   static_cast<ReduceTensorIndices>(IndicesOpt), \
                                                    Rank, \
                                                    NumReduceDim)

 } // namespace device_reduce_instance
...
@@ -47,7 +47,7 @@ using reduce_configuration_2_instances_threadwise = std::tuple<
     >;
 #endif

-template <typename AccDataType, ReduceTensorOp_t ReduceOpId>
+template <typename AccDataType, ReduceTensorOp ReduceOpId>
 using deviceReduceThreadWisePtrType = DeviceReducePtr<
     typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation,
     typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::AccElementwiseOperation>;
@@ -57,9 +57,9 @@ template <typename InDataType,
           typename OutDataType,
           int Rank,
           int NumReduceDim,
-          ReduceTensorOp_t ReduceOpId,
-          NanPropagation_t NanOpt,
-          ReduceTensorIndices_t IndicesOpt>
+          ReduceTensorOp ReduceOpId,
+          NanPropagation NanOpt,
+          ReduceTensorIndices IndicesOpt>
 void add_device_reduce_instance_threadwise(
     std::vector<deviceReduceThreadWisePtrType<AccDataType, ReduceOpId>>& device_op_instances)
 {
@@ -71,11 +71,11 @@ void add_device_reduce_instance_threadwise(
         AccElementwiseOperation;

     constexpr bool Indexable =
-        (ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX ||
-         ReduceOpId == ReduceTensorOp_t::AMAX);
-    constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES);
-    constexpr bool PropagateNan = (NanOpt == NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true;
+        (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
+         ReduceOpId == ReduceTensorOp::AMAX);
+    constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES);
+    constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true;

     using cfg1 = ReductionConfiguration_1<256, 256, 1>;
@@ -119,15 +119,15 @@ void add_device_reduce_instance_threadwise(
                                 IndicesOpt>( \
         std::vector<deviceReduceThreadWisePtrType<compT, ReduceOpId>> & device_op_instances)

 #define ADD_THREADWISE_INST_BY_ID( \
     inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
     ADD_THREADWISE_INST_BY_TYPE(inT, \
                                 compT, \
                                 outT, \
-                                static_cast<ReduceTensorOp_t>(ReduceOpId), \
-                                static_cast<NanPropagation_t>(NanOpt), \
-                                static_cast<ReduceTensorIndices_t>(IndicesOpt), \
+                                static_cast<ReduceTensorOp>(ReduceOpId), \
+                                static_cast<NanPropagation>(NanOpt), \
+                                static_cast<ReduceTensorIndices>(IndicesOpt), \
                                 Rank, \
                                 NumReduceDim)

 #define ADD_THREADWISE_INST_REF_BY_TYPE( \
@@ -146,15 +146,15 @@ void add_device_reduce_instance_threadwise(
         AccElementwiseOperation>> & \
         device_op_instances)

 #define ADD_THREADWISE_INST_REF_BY_ID( \
     inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \
     ADD_THREADWISE_INST_REF_BY_TYPE(inT, \
                                     compT, \
                                     outT, \
-                                    static_cast<ReduceTensorOp_t>(ReduceOpId), \
-                                    static_cast<NanPropagation_t>(NanOpt), \
-                                    static_cast<ReduceTensorIndices_t>(IndicesOpt), \
+                                    static_cast<ReduceTensorOp>(ReduceOpId), \
+                                    static_cast<NanPropagation>(NanOpt), \
+                                    static_cast<ReduceTensorIndices>(IndicesOpt), \
                                     Rank, \
                                     NumReduceDim)

 } // namespace device_reduce_instance
...
@@ -39,7 +39,7 @@ void host_direct_convolution_add_nchwc(const Tensor<TIn>& in,
                                        const ConvDilations& conv_dilations,
                                        const InLeftPads& in_left_pads,
                                        const InRightPads&,
-                                       const ck::ActivTypeEnum_t activ_type)
+                                       const ck::ActivTypeEnum activ_type)
 {
     using namespace ck;
@@ -117,7 +117,7 @@ int main(int argc, char* argv[])
         exit(1);
     }

-    constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu;
+    constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu;

     const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
     const bool do_verification = std::stoi(argv[2]);
@@ -167,7 +167,7 @@ int main(int argc, char* argv[])
     const bool do_log = std::stoi(argv[4]);
     const int nrepeat = std::stoi(argv[5]);

-    constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu;
+    constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu;

 #if 0
     constexpr auto N = Number<1>{};
...
@@ -37,7 +37,7 @@ void host_direct_convolution_nchwc(const Tensor<TIn>& in,
                                    const ConvDilations& conv_dilations,
                                    const InLeftPads& in_left_pads,
                                    const InRightPads&,
-                                   const ck::ActivTypeEnum_t activ_type)
+                                   const ck::ActivTypeEnum activ_type)
 {
     using namespace ck;
@@ -102,7 +102,7 @@ int main(int argc, char* argv[])
         exit(1);
     }

-    constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu;
+    constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu;

     const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
     const bool do_verification = std::stoi(argv[2]);
@@ -149,8 +149,8 @@ int main(int argc, char* argv[])
     const bool do_log = std::stoi(argv[4]);
     const int nrepeat = std::stoi(argv[5]);

-    // constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::Sigmoid;
-    constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu;
+    // constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::Sigmoid;
+    constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu;

 #if 0
     constexpr auto N = Number<1>{};
...
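
The host reference convolutions above take the activation kind as a runtime ck::ActivTypeEnum. A sketch of the per-element dispatch such code can perform (enum re-declared locally with only the enumerators seen in these examples; the LeakyRelu slope is an assumption, not the library's value):

    #include <cmath>

    enum class ActivTypeEnum { None, LeakyRelu, Sigmoid };

    // hypothetical scalar activation, applied to each output element
    inline float apply_activation(ActivTypeEnum type, float x)
    {
        switch(type)
        {
        case ActivTypeEnum::LeakyRelu: return x >= 0.f ? x : 0.01f * x; // slope assumed
        case ActivTypeEnum::Sigmoid: return 1.f / (1.f + std::exp(-x));
        default: return x; // None: pass through
        }
    }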
@@ -38,7 +38,7 @@ void host_direct_convolution_maxpool_nchwc(const Tensor<TIn>& in,
                                            const ConvDilations& conv_dilations,
                                            const InLeftPads& in_left_pads,
                                            const InRightPads&,
-                                           const ck::ActivTypeEnum_t activ_type)
+                                           const ck::ActivTypeEnum activ_type)
 {
     using namespace ck;
@@ -126,7 +126,7 @@ int main(int argc, char* argv[])
         exit(1);
     }

-    constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu;
+    constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu;

     const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[1]));
     const bool do_verification = std::stoi(argv[2]);
@@ -176,18 +176,18 @@ int main(int argc, char* argv[])
     const bool do_log = std::stoi(argv[4]);
     const int nrepeat = std::stoi(argv[5]);

-    constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu;
+    constexpr ck::ActivTypeEnum activ_type = ActivTypeEnum::LeakyRelu;

 #if 1
     constexpr auto N = Number<1>{};
     constexpr auto Hi = Number<1080>{};
     constexpr auto Wi = Number<1920>{};
     constexpr auto Y = Number<3>{};
     constexpr auto X = Number<3>{};
     constexpr auto C0 = Number<2>{};
     constexpr auto C1 = Number<8>{};
     constexpr auto K0 = Number<2>{};
     constexpr auto K1 = Number<8>{};
 #elif 0
     constexpr auto N = Number<1>{};
     constexpr auto Hi = Number<1080>{};
...
@@ -23,7 +23,7 @@ using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 using ReduceSum = ck::tensor_operation::element_wise::ReduceSum;
 using ReduceSquareSum = ck::tensor_operation::element_wise::ReduceSquareSum;

-static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization_t::Default;
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;

 // c[g, m, n] = a[g, m, k] * b[g, n, k]
 // d0[g, m] = reduce0(c[g, m, n])
...
@@ -23,7 +23,7 @@ using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 using ReduceSum = ck::tensor_operation::element_wise::ReduceSum;
 using ReduceSquareSum = ck::tensor_operation::element_wise::ReduceSquareSum;

-static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization_t::Default;
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;

 // c[g, m, n] = a[g, m, k] * b[g, n, k]
 // d0[g, m] = reduce0(c[g, m, n])
...
@@ -23,7 +23,7 @@ using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 using ReduceSum = ck::tensor_operation::element_wise::ReduceSum;
 using ReduceSquareSum = ck::tensor_operation::element_wise::ReduceSquareSum;

-static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization_t::Default;
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;

 // c[g, m, n] = a[g, m, k] * b[g, n, k]
 // d0[g, m] = reduce0(c[g, m, n])
...
@@ -23,7 +23,7 @@ using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 using ReduceSum = ck::tensor_operation::element_wise::ReduceSum;
 using ReduceSquareSum = ck::tensor_operation::element_wise::ReduceSquareSum;

-static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization_t::Default;
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;

 // c[g, m, n] = a[g, m, k] * b[g, n, k]
 // d0[g, m] = reduce0(c[g, m, n])
...
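
The four instance headers above target the same fused kernel, defined by the comments they share. Spelled out as a hypothetical host reference (d1 is assumed to be the ReduceSquareSum output; only d0 is named in the visible comments):

    #include <vector>

    // c[g,m,n] = sum_k a[g,m,k] * b[g,n,k]
    // d0[g,m]  = sum_n c[g,m,n]      (ReduceSum)
    // d1[g,m]  = sum_n c[g,m,n]^2    (ReduceSquareSum, assumed)
    void gemm_reduce_ref(const std::vector<float>& a,
                         const std::vector<float>& b,
                         std::vector<float>& c,
                         std::vector<float>& d0,
                         std::vector<float>& d1,
                         int G, int M, int N, int K)
    {
        for(int g = 0; g < G; ++g)
            for(int m = 0; m < M; ++m)
            {
                float sum = 0.f, sq_sum = 0.f;
                for(int n = 0; n < N; ++n)
                {
                    float acc = 0.f;
                    for(int k = 0; k < K; ++k)
                        acc += a[(g * M + m) * K + k] * b[(g * N + n) * K + k];
                    c[(g * M + m) * N + n] = acc;
                    sum += acc;
                    sq_sum += acc * acc;
                }
                d0[g * M + m] = sum;
                d1[g * M + m] = sq_sum;
            }
    }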
@@ -18,13 +18,13 @@ using S = ck::Sequence<Is...>;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;

 static constexpr auto ConvFwdDefault =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default;
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;

 static constexpr auto ConvFwd1x1P0 =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Filter1x1Pad0;
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0;

 static constexpr auto ConvFwd1x1S1P0 =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0;
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0;

 // Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k]
 using device_conv1d_fwd_xdl_nwc_kxc_nwk_bf16_instances = std::tuple<
...
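
The three ConvolutionForwardSpecialization values let an instance commit at compile time to filter shapes whose padding/stride handling can be skipped. A hypothetical helper showing the selection logic the names imply (enum re-declared locally):

    enum class ConvolutionForwardSpecialization { Default, Filter1x1Pad0, Filter1x1Stride1Pad0 };

    // hypothetical: pick the most specialized (cheapest) variant the
    // problem shape allows; anything else falls back to Default
    constexpr ConvolutionForwardSpecialization
    pick_conv_fwd_spec(int y, int x, int pad, int stride)
    {
        return (y == 1 && x == 1 && pad == 0)
                   ? (stride == 1 ? ConvolutionForwardSpecialization::Filter1x1Stride1Pad0
                                  : ConvolutionForwardSpecialization::Filter1x1Pad0)
                   : ConvolutionForwardSpecialization::Default;
    }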
@@ -18,13 +18,13 @@ using S = ck::Sequence<Is...>;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;

 static constexpr auto ConvFwdDefault =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default;
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;

 static constexpr auto ConvFwd1x1P0 =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Filter1x1Pad0;
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0;

 static constexpr auto ConvFwd1x1S1P0 =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0;
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0;

 // Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k]
 using device_conv1d_fwd_xdl_nwc_kxc_nwk_f16_instances = std::tuple<
...
@@ -17,13 +17,13 @@ using S = ck::Sequence<Is...>;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;

 static constexpr auto ConvFwdDefault =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default;
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;

 static constexpr auto ConvFwd1x1P0 =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Filter1x1Pad0;
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0;

 static constexpr auto ConvFwd1x1S1P0 =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0;
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0;

 //------------------------------------------------------------------------------
 // Conv1D
...
@@ -17,13 +17,13 @@ using S = ck::Sequence<Is...>;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;

 static constexpr auto ConvFwdDefault =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default;
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;

 static constexpr auto ConvFwd1x1P0 =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Filter1x1Pad0;
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0;

 static constexpr auto ConvFwd1x1S1P0 =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0;
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0;

 // Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k]
 using device_conv1d_fwd_xdl_nwc_kxc_nwk_int8_instances =
...