#ifndef DEVICE_REDUCE_INSTANCE_THREADWISE_HPP
#define DEVICE_REDUCE_INSTANCE_THREADWISE_HPP

#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_impl_common.hpp"
#include "device_reduce_threadwise.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace device_reduce_instance {

#ifdef QUICK_REDUCE_TEST
// Reduced configuration set to keep build and test time short when quick-testing.
using reduce_configuration_2_instances_threadwise = std::tuple<
    // clang-format off
    // InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize
    ReductionConfiguration_2<0, 2, 2, 2, 1>,
    ReductionConfiguration_2<0, 1, 1, 2, 1>,
    ReductionConfiguration_2<1, 2, 1, 1, 2>,
    ReductionConfiguration_2<1, 2, 2, 1, 2>,
    ReductionConfiguration_2<0, 1, 1, 3, 1>,
    ReductionConfiguration_2<1, 1, 1, 1, 3>
    // clang-format on
    >;
#else
using reduce_configuration_2_instances_threadwise = std::tuple<
    // clang-format off
    // InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize
    ReductionConfiguration_2<0, 4, 4, 8, 1>,
    ReductionConfiguration_2<0, 4, 4, 4, 1>,
    ReductionConfiguration_2<0, 2, 2, 2, 1>,

    ReductionConfiguration_2<1, 4, 1, 1, 8>,
    ReductionConfiguration_2<1, 4, 1, 1, 4>,
    ReductionConfiguration_2<1, 2, 1, 1, 2>,

    // special instances
    ReductionConfiguration_2<0, 1, 1, 3, 1>,
    ReductionConfiguration_2<0, 1, 1, 5, 1>,
    ReductionConfiguration_2<0, 1, 1, 7, 1>,
    ReductionConfiguration_2<0, 1, 1, 11, 1>,

    ReductionConfiguration_2<1, 1, 1, 1, 3>,
    ReductionConfiguration_2<1, 1, 1, 1, 5>,
    ReductionConfiguration_2<1, 1, 1, 1, 7>,
    ReductionConfiguration_2<1, 1, 1, 1, 11>
    // clang-format on
    >;
#endif

template <typename compT, ReduceTensorOp_t ReduceOpId>
using deviceReduceThreadWisePtrType = DeviceReducePtr<
    typename reduce_unary_operator<compT, ReduceOpId, true, true>::InElementwiseOperation,
    typename reduce_unary_operator<compT, ReduceOpId, true, true>::AccElementwiseOperation>;

// Appends one DeviceReduceThreadWise instance per entry of
// reduce_configuration_2_instances_threadwise to device_op_instances.
template <typename inT,
          typename compT,
          typename outT,
          int Rank,
          typename InnerDims,
          ReduceTensorOp_t ReduceOpId,
          NanPropagation_t NanOpt,
          ReduceTensorIndices_t IndicesOpt>
void add_device_reduce_instance_threadwise(
    std::vector<deviceReduceThreadWisePtrType<compT, ReduceOpId>>& device_op_instances)
{
    using ReduceOperation = typename reduce_binary_operator<compT, ReduceOpId>::opType;
    using InElementwiseOperation =
        typename reduce_unary_operator<compT, ReduceOpId, true, true>::InElementwiseOperation;
    using AccElementwiseOperation =
        typename reduce_unary_operator<compT, ReduceOpId, true, true>::AccElementwiseOperation;

    // Only MIN/MAX/AMAX reductions can also return the index of the reduced element.
    constexpr bool Indexable =
        (ReduceOpId == ReduceTensorOp_t::MIN || ReduceOpId == ReduceTensorOp_t::MAX ||
         ReduceOpId == ReduceTensorOp_t::AMAX);
    constexpr bool NeedIndices = Indexable && (IndicesOpt != ReduceTensorIndices_t::NO_INDICES);

    constexpr bool PropagateNan = (NanOpt != NanPropagation_t::NOT_PROPAGATE_NAN);

    using cfg1 = ReductionConfiguration_1<256, 256, 1>;

    static_for<0, std::tuple_size<reduce_configuration_2_instances_threadwise>::value, 1>{}(
        [&](auto j) {
            using cfg2 = remove_cvref_t<decltype(
                std::get<j.value>(reduce_configuration_2_instances_threadwise{}))>;

            using ReduceOpInstance = DeviceReduceThreadWise<inT,
                                                            compT,
                                                            outT,
                                                            Rank,
                                                            InnerDims,
                                                            ReduceOperation,
                                                            InElementwiseOperation,
                                                            AccElementwiseOperation,
                                                            PropagateNan,
                                                            NeedIndices,
                                                            cfg1::BlockSize_,
                                                            cfg1::MThreadClusterSize_,
                                                            cfg1::KThreadClusterSize_,
                                                            cfg2::MThreadSliceSize_,
                                                            cfg2::KThreadSliceSize_,
                                                            cfg2::InSrcVectorDim_,
                                                            cfg2::InSrcVectorSize_,
                                                            cfg2::OutDstVectorSize_>;

            device_op_instances.push_back(std::make_unique<ReduceOpInstance>(ReduceOpInstance{}));
        });
}

#define ADD_THREADWISE_INST_BY_TYPE(inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...) \
    template void add_device_reduce_instance_threadwise<inT,                                     \
                                                        compT,                                   \
                                                        outT,                                    \
                                                        Rank,                                    \
                                                        Sequence<__VA_ARGS__>,                   \
                                                        ReduceOpId,                              \
                                                        NanOpt,                                  \
                                                        IndicesOpt>(                             \
        std::vector<deviceReduceThreadWisePtrType<compT, ReduceOpId>> & device_op_instances)

#define ADD_THREADWISE_INST_BY_ID(inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...) \
    ADD_THREADWISE_INST_BY_TYPE(inT,                                                           \
                                compT,                                                         \
                                outT,                                                          \
                                static_cast<ReduceTensorOp_t>(ReduceOpId),                     \
                                static_cast<NanPropagation_t>(NanOpt),                         \
                                static_cast<ReduceTensorIndices_t>(IndicesOpt),                \
                                Rank,                                                          \
                                __VA_ARGS__)

#define ADD_THREADWISE_INST_REF_BY_TYPE(                                            \
    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...)                    \
    extern template void add_device_reduce_instance_threadwise<inT,                 \
                                                               compT,               \
                                                               outT,                \
                                                               Rank,                \
                                                               Sequence<__VA_ARGS__>, \
                                                               ReduceOpId,          \
                                                               NanOpt,              \
                                                               IndicesOpt>(         \
        std::vector<DeviceReducePtr<                                                \
            typename reduce_unary_operator<compT, ReduceOpId, true, true>::         \
                InElementwiseOperation,                                             \
            typename reduce_unary_operator<compT, ReduceOpId, true, true>::         \
                AccElementwiseOperation>> &                                         \
            device_op_instances)

#define ADD_THREADWISE_INST_REF_BY_ID(inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, ...) \
    ADD_THREADWISE_INST_REF_BY_TYPE(inT,                                                           \
                                    compT,                                                         \
                                    outT,                                                          \
                                    static_cast<ReduceTensorOp_t>(ReduceOpId),                     \
                                    static_cast<NanPropagation_t>(NanOpt),                         \
                                    static_cast<ReduceTensorIndices_t>(IndicesOpt),                \
                                    Rank,                                                          \
                                    __VA_ARGS__)
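// ---------------------------------------------------------------------------
// Illustrative usage (a hedged sketch, not part of the original header): an
// instance translation unit typically expands ADD_THREADWISE_INST_BY_ID to
// emit the explicit instantiation, and a consuming header expands the
// matching ADD_THREADWISE_INST_REF_BY_ID to declare it as extern, so client
// code links against the prebuilt instances instead of re-instantiating them.
// The file names and numeric ids below are hypothetical examples, assuming
// the cudnn-style ids ADD = 0, NOT_PROPAGATE_NAN = 0, NO_INDICES = 0:
//
//   // in e.g. device_reduce_instance_threadwise_f32_add.cpp (hypothetical):
//   ADD_THREADWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 0);
//
//   // in the matching reference header (hypothetical):
//   ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 0);
//
// Here Rank = 4 and the trailing variadic argument becomes the InnerDims
// Sequence of reduced dimensions, i.e. Sequence<0>.
// ---------------------------------------------------------------------------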
} // namespace device_reduce_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

#endif