// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #ifndef ROCPRIM_WARP_WARP_REDUCE_HPP_ #define ROCPRIM_WARP_WARP_REDUCE_HPP_ #include #include "../config.hpp" #include "../detail/various.hpp" #include "../intrinsics.hpp" #include "../functional.hpp" #include "../types.hpp" #include "detail/warp_reduce_crosslane.hpp" #include "detail/warp_reduce_shared_mem.hpp" /// \addtogroup warpmodule /// @{ BEGIN_ROCPRIM_NAMESPACE namespace detail { // Select warp_reduce implementation based WarpSize template struct select_warp_reduce_impl { typedef typename std::conditional< // can we use crosslane (DPP or shuffle-based) implementation? detail::is_warpsize_shuffleable::value, detail::warp_reduce_crosslane, // yes detail::warp_reduce_shared_mem // no >::type type; }; } // end namespace detail /// \brief The warp_reduce class is a warp level parallel primitive which provides methods /// for performing reduction operations on items partitioned across threads in a hardware /// warp. /// /// \tparam T - the input/output type. /// \tparam WarpSize - the size of logical warp size, which can be equal to or less than /// the size of hardware warp (see rocprim::device_warp_size()). Reduce operations are performed /// separately within groups determined by WarpSize. /// \tparam UseAllReduce - input parameter to determine whether to broadcast final reduction /// value to all threads (default is false). /// /// \par Overview /// * \p WarpSize must be equal to or less than the size of hardware warp (see /// rocprim::device_warp_size()). If it is less, reduce is performed separately within groups /// determined by WarpSize. \n /// For example, if \p WarpSize is 4, hardware warp is 64, reduction will be performed in logical /// warps grouped like this: `{ {0, 1, 2, 3}, {4, 5, 6, 7 }, ..., {60, 61, 62, 63} }` /// (thread is represented here by its id within hardware warp). /// * Logical warp is a group of \p WarpSize consecutive threads from the same hardware warp. /// * Supports non-commutative reduce operators. However, a reduce operator should be /// associative. When used with non-associative functions the results may be non-deterministic /// and/or vary in precision. /// * Number of threads executing warp_reduce's function must be a multiple of \p WarpSize; /// * All threads from a logical warp must be in the same hardware warp. /// /// \par Examples /// \parblock /// In the examples reduce operation is performed on groups of 16 threads, each provides /// one \p int value, result is returned using the same variable as for input. Hardware /// warp size is 64. Block (tile) size is 64. /// /// \code{.cpp} /// __global__ void example_kernel(...) /// { /// // specialize warp_reduce for int and logical warp of 16 threads /// using warp_reduce_int = rocprim::warp_reduce; /// // allocate storage in shared memory /// __shared__ warp_reduce_int::storage_type temp[4]; /// /// int logical_warp_id = threadIdx.x/16; /// int value = ...; /// // execute reduce /// warp_reduce_int().reduce( /// value, // input /// value, // output /// temp[logical_warp_id] /// ); /// ... /// } /// \endcode /// \endparblock template< class T, unsigned int WarpSize = device_warp_size(), bool UseAllReduce = false > class warp_reduce #ifndef DOXYGEN_SHOULD_SKIP_THIS : private detail::select_warp_reduce_impl::type #endif { using base_type = typename detail::select_warp_reduce_impl::type; // Check if WarpSize is valid for the targets static_assert(WarpSize <= ROCPRIM_MAX_WARP_SIZE, "WarpSize can't be greater than hardware warp size."); public: /// \brief Struct used to allocate a temporary memory that is required for thread /// communication during operations provided by related parallel primitive. /// /// Depending on the implemention the operations exposed by parallel primitive may /// require a temporary storage for thread communication. The storage should be allocated /// using keywords __shared__. It can be aliased to /// an externally allocated memory, or be a part of a union type with other storage types /// to increase shared memory reusability. using storage_type = typename base_type::storage_type; /// \brief Performs reduction across threads in a logical warp. /// /// \tparam BinaryFunction - type of binary function used for reduce. Default type /// is rocprim::plus. /// /// \param [in] input - thread input value. /// \param [out] output - reference to a thread output value. May be aliased with \p input. /// \param [in] storage - reference to a temporary storage object of type storage_type. /// \param [in] reduce_op - binary operation function object that will be used for reduce. /// The signature of the function should be equivalent to the following: /// T f(const T &a, const T &b);. The signature does not need to have /// const &, but function object must not modify the objects passed to it. /// /// \par Storage reusage /// Synchronization barrier should be placed before \p storage is reused /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads(). /// /// \par Examples /// \parblock /// In the examples reduce operation is performed on groups of 16 threads, each provides /// one \p int value, result is returned using the same variable as for input. Hardware /// warp size is 64. Block (tile) size is 64. /// /// \code{.cpp} /// __global__ void example_kernel(...) /// { /// // specialize warp_reduce for int and logical warp of 16 threads /// using warp_reduce_int = rocprim::warp_reduce; /// // allocate storage in shared memory /// __shared__ warp_reduce_int::storage_type temp[4]; /// /// int logical_warp_id = threadIdx.x/16; /// int value = ...; /// // execute reduction /// warp_reduce_int().reduce( /// value, // input /// value, // output /// temp[logical_warp_id], /// rocprim::minimum() /// ); /// ... /// } /// \endcode /// \endparblock template, unsigned int FunctionWarpSize = WarpSize> ROCPRIM_DEVICE ROCPRIM_INLINE auto reduce(T input, T& output, storage_type& storage, BinaryFunction reduce_op = BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void>::type { base_type::reduce(input, output, storage, reduce_op); } /// \brief Performs reduction across threads in a logical warp. /// Invalid Warp Size template, unsigned int FunctionWarpSize = WarpSize> ROCPRIM_DEVICE ROCPRIM_INLINE auto reduce(T , T& , storage_type& , BinaryFunction reduce_op = BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void>::type { (void) reduce_op; ROCPRIM_PRINT_ERROR_ONCE("Specified warp size exceeds current hardware supported warp size. Aborting warp sort."); return; } /// \brief Performs reduction across threads in a logical warp. /// /// \tparam BinaryFunction - type of binary function used for reduce. Default type /// is rocprim::plus. /// /// \param [in] input - thread input value. /// \param [out] output - reference to a thread output value. May be aliased with \p input. /// \param [in] valid_items - number of items that will be reduced in the warp. /// \param [in] storage - reference to a temporary storage object of type storage_type. /// \param [in] reduce_op - binary operation function object that will be used for reduce. /// The signature of the function should be equivalent to the following: /// T f(const T &a, const T &b);. The signature does not need to have /// const &, but function object must not modify the objects passed to it. /// /// \par Storage reusage /// Synchronization barrier should be placed before \p storage is reused /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads(). /// /// \par Examples /// \parblock /// In the examples reduce operation is performed on groups of 16 threads, each provides /// one \p int value, result is returned using the same variable as for input. Hardware /// warp size is 64. Block (tile) size is 64. /// /// \code{.cpp} /// __global__ void example_kernel(...) /// { /// // specialize warp_reduce for int and logical warp of 16 threads /// using warp_reduce_int = rocprim::warp_reduce; /// // allocate storage in shared memory /// __shared__ warp_reduce_int::storage_type temp[4]; /// /// int logical_warp_id = threadIdx.x/16; /// int value = ...; /// int valid_items = 4; /// // execute reduction /// warp_reduce_int().reduce( /// value, // input /// value, // output /// valid_items, /// temp[logical_warp_id] /// ); /// ... /// } /// \endcode /// \endparblock template, unsigned int FunctionWarpSize = WarpSize> ROCPRIM_DEVICE ROCPRIM_INLINE auto reduce(T input, T& output, int valid_items, storage_type& storage, BinaryFunction reduce_op = BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void>::type { base_type::reduce(input, output, valid_items, storage, reduce_op); } /// \brief Performs reduction across threads in a logical warp. /// Invalid Warp Size template, unsigned int FunctionWarpSize = WarpSize> ROCPRIM_DEVICE ROCPRIM_INLINE auto reduce(T , T& , int , storage_type& , BinaryFunction reduce_op = BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void>::type { (void) reduce_op; ROCPRIM_PRINT_ERROR_ONCE("Specified warp size exceeds current hardware supported warp size. Aborting warp sort."); return; } /// \brief Performs head-segmented reduction across threads in a logical warp. /// /// \tparam Flag - type of head flags. Must be contextually convertible to \p bool. /// \tparam BinaryFunction - type of binary function used for reduce. Default type /// is rocprim::plus. /// /// \param [in] input - thread input value. /// \param [out] output - reference to a thread output value. May be aliased with \p input. /// \param [in] flag - thread head flag, \p true flags mark beginnings of segments. /// \param [in] storage - reference to a temporary storage object of type storage_type. /// \param [in] reduce_op - binary operation function object that will be used for reduce. /// The signature of the function should be equivalent to the following: /// T f(const T &a, const T &b);. The signature does not need to have /// const &, but function object must not modify the objects passed to it. /// /// \par Storage reusage /// Synchronization barrier should be placed before \p storage is reused /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads(). template, unsigned int FunctionWarpSize = WarpSize> ROCPRIM_DEVICE ROCPRIM_INLINE auto head_segmented_reduce(T input, T& output, Flag flag, storage_type& storage, BinaryFunction reduce_op = BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void>::type { base_type::head_segmented_reduce(input, output, flag, storage, reduce_op); } /// \brief Performs head-segmented reduction across threads in a logical warp. /// Invalid Warp Size template, unsigned int FunctionWarpSize = WarpSize> ROCPRIM_DEVICE ROCPRIM_INLINE auto head_segmented_reduce(T , T& , Flag , storage_type& , BinaryFunction reduce_op = BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void>::type { (void) reduce_op; ROCPRIM_PRINT_ERROR_ONCE("Specified warp size exceeds current hardware supported warp size. Aborting warp sort."); return; } /// \brief Performs tail-segmented reduction across threads in a logical warp. /// /// \tparam Flag - type of tail flags. Must be contextually convertible to \p bool. /// \tparam BinaryFunction - type of binary function used for reduce. Default type /// is rocprim::plus. /// /// \param [in] input - thread input value. /// \param [out] output - reference to a thread output value. May be aliased with \p input. /// \param [in] flag - thread tail flag, \p true flags mark ends of segments. /// \param [in] storage - reference to a temporary storage object of type storage_type. /// \param [in] reduce_op - binary operation function object that will be used for reduce. /// The signature of the function should be equivalent to the following: /// T f(const T &a, const T &b);. The signature does not need to have /// const &, but function object must not modify the objects passed to it. /// /// \par Storage reusage /// Synchronization barrier should be placed before \p storage is reused /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads(). template, unsigned int FunctionWarpSize = WarpSize> ROCPRIM_DEVICE ROCPRIM_INLINE auto tail_segmented_reduce(T input, T& output, Flag flag, storage_type& storage, BinaryFunction reduce_op = BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void>::type { base_type::tail_segmented_reduce(input, output, flag, storage, reduce_op); } /// \brief Performs tail-segmented reduction across threads in a logical warp. /// Invalid Warp Size template, unsigned int FunctionWarpSize = WarpSize> ROCPRIM_DEVICE ROCPRIM_INLINE auto tail_segmented_reduce(T , T& , Flag , storage_type& , BinaryFunction reduce_op = BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void>::type { (void) reduce_op; ROCPRIM_PRINT_ERROR_ONCE("Specified warp size exceeds current hardware supported warp size. Aborting warp sort."); return; } }; END_ROCPRIM_NAMESPACE /// @} // end of group warpmodule #endif // ROCPRIM_WARP_WARP_REDUCE_HPP_