// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #ifndef ROCPRIM_WARP_WARP_SCAN_HPP_ #define ROCPRIM_WARP_WARP_SCAN_HPP_ #include #include "../config.hpp" #include "../detail/various.hpp" #include "../intrinsics.hpp" #include "../functional.hpp" #include "../types.hpp" #include "detail/warp_scan_crosslane.hpp" #include "detail/warp_scan_shared_mem.hpp" /// \addtogroup warpmodule /// @{ BEGIN_ROCPRIM_NAMESPACE namespace detail { // Select warp_scan implementation based WarpSize template struct select_warp_scan_impl { typedef typename std::conditional< // can we use crosslane (DPP or shuffle-based) implementation? detail::is_warpsize_shuffleable::value, detail::warp_scan_crosslane, // yes detail::warp_scan_shared_mem // no >::type type; }; } // end namespace detail /// \brief The warp_scan class is a warp level parallel primitive which provides methods /// for performing inclusive and exclusive scan operations of items partitioned across /// threads in a hardware warp. /// /// \tparam T - the input/output type. /// \tparam WarpSize - the size of logical warp size, which can be equal to or less than /// the size of hardware warp (see rocprim::device_warp_size()). Scan operations are performed /// separately within groups determined by WarpSize. /// /// \par Overview /// * \p WarpSize must be equal to or less than the size of hardware warp (see /// rocprim::device_warp_size()). If it is less, scan is performed separately within groups /// determined by WarpSize. \n /// For example, if \p WarpSize is 4, hardware warp is 64, scan will be performed in logical /// warps grouped like this: `{ {0, 1, 2, 3}, {4, 5, 6, 7 }, ..., {60, 61, 62, 63} }` /// (thread is represented here by its id within hardware warp). /// * Logical warp is a group of \p WarpSize consecutive threads from the same hardware warp. /// * Supports non-commutative scan operators. However, a scan operator should be /// associative. When used with non-associative functions the results may be non-deterministic /// and/or vary in precision. /// * Number of threads executing warp_scan's function must be a multiple of \p WarpSize; /// * All threads from a logical warp must be in the same hardware warp. /// /// \par Examples /// \parblock /// In the examples scan operation is performed on groups of 16 threads, each provides /// one \p int value, result is returned using the same variable as for input. Hardware /// warp size is 64. Block (tile) size is 64. /// /// \code{.cpp} /// __global__ void example_kernel(...) /// { /// // specialize warp_scan for int and logical warp of 16 threads /// using warp_scan_int = rocprim::warp_scan; /// // allocate storage in shared memory /// __shared__ warp_scan_int::storage_type temp[4]; /// /// int logical_warp_id = threadIdx.x/16; /// int value = ...; /// // execute inclusive scan /// warp_scan_int().inclusive_scan( /// value, // input /// value, // output /// temp[logical_warp_id] /// ); /// ... /// } /// \endcode /// \endparblock template< class T, unsigned int WarpSize = device_warp_size() > class warp_scan #ifndef DOXYGEN_SHOULD_SKIP_THIS : private detail::select_warp_scan_impl::type #endif { using base_type = typename detail::select_warp_scan_impl::type; // Check if WarpSize is valid for the targets static_assert(WarpSize <= ROCPRIM_MAX_WARP_SIZE, "WarpSize can't be greater than hardware warp size."); public: /// \brief Struct used to allocate a temporary memory that is required for thread /// communication during operations provided by related parallel primitive. /// /// Depending on the implemention the operations exposed by parallel primitive may /// require a temporary storage for thread communication. The storage should be allocated /// using keywords __shared__. It can be aliased to /// an externally allocated memory, or be a part of a union type with other storage types /// to increase shared memory reusability. using storage_type = typename base_type::storage_type; /// \brief Performs inclusive scan across threads in a logical warp. /// /// \tparam BinaryFunction - type of binary function used for scan. Default type /// is rocprim::plus. /// /// \param [in] input - thread input value. /// \param [out] output - reference to a thread output value. May be aliased with \p input. /// \param [in] storage - reference to a temporary storage object of type storage_type. /// \param [in] scan_op - binary operation function object that will be used for scan. /// The signature of the function should be equivalent to the following: /// T f(const T &a, const T &b);. The signature does not need to have /// const &, but function object must not modify the objects passed to it. /// /// \par Storage reusage /// Synchronization barrier should be placed before \p storage is reused /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads(). /// /// \par Examples /// \parblock /// The examples present inclusive min scan operations performed on groups of 32 threads, /// each provides one \p float value, result is returned using the same variable as for input. /// Hardware warp size is 64. Block (tile) size is 256. /// /// \code{.cpp} /// __global__ void example_kernel(...) // blockDim.x = 256 /// { /// // specialize warp_scan for float and logical warp of 32 threads /// using warp_scan_f = rocprim::warp_scan; /// // allocate storage in shared memory /// __shared__ warp_scan_float::storage_type temp[8]; // 256/32 = 8 /// /// int logical_warp_id = threadIdx.x/32; /// float value = ...; /// // execute inclusive min scan /// warp_scan_float().inclusive_scan( /// value, // input /// value, // output /// temp[logical_warp_id], /// rocprim::minimum() /// ); /// ... /// } /// \endcode /// /// If the input values across threads in a block/tile are {1, -2, 3, -4, ..., 255, -256}, then /// output values in the first logical warp will be {1, -2, -2, -4, ..., -32}, in the second: /// {33, -34, -34, -36, ..., -64} etc. /// \endparblock template, unsigned int FunctionWarpSize = WarpSize> ROCPRIM_DEVICE ROCPRIM_INLINE auto inclusive_scan(T input, T& output, storage_type& storage, BinaryFunction scan_op = BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void>::type { base_type::inclusive_scan(input, output, storage, scan_op); } /// \brief Performs inclusive scan across threads in a logical warp. /// Invalid Warp Size template, unsigned int FunctionWarpSize = WarpSize> ROCPRIM_DEVICE ROCPRIM_INLINE auto inclusive_scan(T , T& , storage_type& , BinaryFunction scan_op = BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void>::type { (void) scan_op; ROCPRIM_PRINT_ERROR_ONCE("Specified warp size exceeds current hardware supported warp size . Aborting warp sort."); return; } /// \brief Performs inclusive scan and reduction across threads in a logical warp. /// /// \tparam BinaryFunction - type of binary function used for scan. Default type /// is rocprim::plus. /// /// \param [in] input - thread input value. /// \param [out] output - reference to a thread output value. May be aliased with \p input. /// \param [out] reduction - result of reducing of all \p input values in logical warp. /// \param [in] storage - reference to a temporary storage object of type storage_type. /// \param [in] scan_op - binary operation function object that will be used for scan. /// The signature of the function should be equivalent to the following: /// T f(const T &a, const T &b);. The signature does not need to have /// const &, but function object must not modify the objects passed to it. /// /// \par Storage reusage /// Synchronization barrier should be placed before \p storage is reused /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads(). /// /// \par Examples /// \parblock /// The examples present inclusive prefix sum operations performed on groups of 64 threads, /// each thread provides one \p int value. Hardware warp size is 64. Block (tile) size is 256. /// /// \code{.cpp} /// __global__ void example_kernel(...) // blockDim.x = 256 /// { /// // specialize warp_scan for int and logical warp of 64 threads /// using warp_scan_int = rocprim::warp_scan; /// // allocate storage in shared memory /// __shared__ warp_scan_int::storage_type temp[4]; // 256/64 = 4 /// /// int logical_warp_id = threadIdx.x/64; /// int input = ...; /// int output, reduction; /// // inclusive prefix sum /// warp_scan_int().inclusive_scan( /// input, /// output, /// reduction, /// temp[logical_warp_id] /// ); /// ... /// } /// \endcode /// /// If the \p input values across threads in a block/tile are {1, 1, 1, 1, ..., 1, 1}, then /// \p output values in the every logical warp will be {1, 2, 3, 4, ..., 64}. /// The \p reduction will be equal \p 64. /// \endparblock template, unsigned int FunctionWarpSize = WarpSize> ROCPRIM_DEVICE ROCPRIM_INLINE auto inclusive_scan(T input, T& output, T& reduction, storage_type& storage, BinaryFunction scan_op = BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void>::type { base_type::inclusive_scan(input, output, reduction, storage, scan_op); } /// \brief Performs inclusive scan and reduction across threads in a logical warp. /// Invalid Warp Size template, unsigned int FunctionWarpSize = WarpSize> ROCPRIM_DEVICE ROCPRIM_INLINE auto inclusive_scan(T , T& , T& , storage_type& , BinaryFunction scan_op = BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void>::type { (void) scan_op; ROCPRIM_PRINT_ERROR_ONCE("Specified warp size exceeds current hardware supported warp size . Aborting warp sort."); return; } /// \brief Performs exclusive scan across threads in a logical warp. /// /// \tparam BinaryFunction - type of binary function used for scan. Default type /// is rocprim::plus. /// /// \param [in] input - thread input value. /// \param [out] output - reference to a thread output value. May be aliased with \p input. /// \param [in] init - initial value used to start the exclusive scan. Should be the same /// for all threads in a logical warp. /// \param [in] storage - reference to a temporary storage object of type storage_type. /// \param [in] scan_op - binary operation function object that will be used for scan. /// The signature of the function should be equivalent to the following: /// T f(const T &a, const T &b);. The signature does not need to have /// const &, but function object must not modify the objects passed to it. /// /// \par Storage reusage /// Synchronization barrier should be placed before \p storage is reused /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads(). /// /// \par Examples /// \parblock /// The examples present exclusive min scan operations performed on groups of 32 threads, /// each provides one \p float value, result is returned using the same variable as for input. /// Hardware warp size is 64. Block (tile) size is 256. /// /// \code{.cpp} /// __global__ void example_kernel(...) // blockDim.x = 256 /// { /// // specialize warp_scan for float and logical warp of 32 threads /// using warp_scan_f = rocprim::warp_scan; /// // allocate storage in shared memory /// __shared__ warp_scan_float::storage_type temp[8]; // 256/32 = 8 /// /// int logical_warp_id = threadIdx.x/32; /// float value = ...; /// // execute exclusive min scan /// warp_scan_float().exclusive_scan( /// value, // input /// value, // output /// 100.0f, // init /// temp[logical_warp_id], /// rocprim::minimum() /// ); /// ... /// } /// \endcode /// /// If the initial value is \p 100 and input values across threads in a block/tile are /// {1, -2, 3, -4, ..., 255, -256}, then output values in the first logical /// warp will be {100, 1, -2, -2, -4, ..., -30}, in the second: /// {100, 33, -34, -34, -36, ..., -62} etc. /// \endparblock template, unsigned int FunctionWarpSize = WarpSize> ROCPRIM_DEVICE ROCPRIM_INLINE auto exclusive_scan(T input, T& output, T init, storage_type& storage, BinaryFunction scan_op = BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void>::type { base_type::exclusive_scan(input, output, init, storage, scan_op); } /// \brief Performs exclusive scan across threads in a logical warp. /// Invalid Warp Size template, unsigned int FunctionWarpSize = WarpSize> ROCPRIM_DEVICE ROCPRIM_INLINE auto exclusive_scan(T , T& , T , storage_type& , BinaryFunction scan_op = BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void>::type { (void) scan_op; ROCPRIM_PRINT_ERROR_ONCE("Specified warp size exceeds current hardware supported warp size . Aborting warp sort."); return; } /// \brief Performs exclusive scan and reduction across threads in a logical warp. /// /// \tparam BinaryFunction - type of binary function used for scan. Default type /// is rocprim::plus. /// /// \param [in] input - thread input value. /// \param [out] output - reference to a thread output value. May be aliased with \p input. /// \param [in] init - initial value used to start the exclusive scan. Should be the same /// for all threads in a logical warp. /// \param [out] reduction - result of reducing of all \p input values in logical warp. /// \p init value is not included in the reduction. /// \param [in] storage - reference to a temporary storage object of type storage_type. /// \param [in] scan_op - binary operation function object that will be used for scan. /// The signature of the function should be equivalent to the following: /// T f(const T &a, const T &b);. The signature does not need to have /// const &, but function object must not modify the objects passed to it. /// /// \par Storage reusage /// Synchronization barrier should be placed before \p storage is reused /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads(). /// /// \par Examples /// \parblock /// The examples present exclusive prefix sum operations performed on groups of 64 threads, /// each thread provides one \p int value. Hardware warp size is 64. Block (tile) size is 256. /// /// \code{.cpp} /// __global__ void example_kernel(...) // blockDim.x = 256 /// { /// // specialize warp_scan for int and logical warp of 64 threads /// using warp_scan_int = rocprim::warp_scan; /// // allocate storage in shared memory /// __shared__ warp_scan_int::storage_type temp[4]; // 256/64 = 4 /// /// int logical_warp_id = threadIdx.x/64; /// int input = ...; /// int output, reduction; /// // exclusive prefix sum /// warp_scan_int().exclusive_scan( /// input, /// output, /// 10, // init /// reduction, /// temp[logical_warp_id] /// ); /// ... /// } /// \endcode /// /// If the initial value is \p 10 and \p input values across threads in a block/tile are /// {1, 1, ..., 1, 1}, then \p output values in every logical warp will be /// {10, 11, 12, 13, ..., 73}. The \p reduction will be 64. /// \endparblock template, unsigned int FunctionWarpSize = WarpSize> ROCPRIM_DEVICE ROCPRIM_INLINE auto exclusive_scan(T input, T& output, T init, T& reduction, storage_type& storage, BinaryFunction scan_op = BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void>::type { base_type::exclusive_scan(input, output, init, reduction, storage, scan_op); } /// \brief Performs exclusive scan and reduction across threads in a logical warp. /// Invalid Warp Size template, unsigned int FunctionWarpSize = WarpSize> ROCPRIM_DEVICE ROCPRIM_INLINE auto exclusive_scan(T , T& , T , T& , storage_type& , BinaryFunction scan_op = BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void>::type { (void) scan_op; ROCPRIM_PRINT_ERROR_ONCE("Specified warp size exceeds current hardware supported warp size . Aborting warp sort."); return; } /// \brief Performs inclusive and exclusive scan operations across threads /// in a logical warp. /// /// \tparam BinaryFunction - type of binary function used for scan. Default type /// is rocprim::plus. /// /// \param [in] input - thread input value. /// \param [out] inclusive_output - reference to a thread inclusive-scan output value. /// \param [out] exclusive_output - reference to a thread exclusive-scan output value. /// \param [in] init - initial value used to start the exclusive scan. Should be the same /// for all threads in a logical warp. /// \param [in] storage - reference to a temporary storage object of type storage_type. /// \param [in] scan_op - binary operation function object that will be used for scan. /// The signature of the function should be equivalent to the following: /// T f(const T &a, const T &b);. The signature does not need to have /// const &, but function object must not modify the objects passed to it. /// /// \par Storage reusage /// Synchronization barrier should be placed before \p storage is reused /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads(). /// /// \par Examples /// \parblock /// The examples present min inclusive and exclusive scan operations performed on groups of 32 threads, /// each provides one \p float value, result is returned using the same variable as for input. /// Hardware warp size is 64. Block (tile) size is 256. /// /// \code{.cpp} /// __global__ void example_kernel(...) // blockDim.x = 256 /// { /// // specialize warp_scan for float and logical warp of 32 threads /// using warp_scan_f = rocprim::warp_scan; /// // allocate storage in shared memory /// __shared__ warp_scan_float::storage_type temp[8]; // 256/32 = 8 /// /// int logical_warp_id = threadIdx.x/32; /// float input = ...; /// float ex_output, in_output; /// // execute exclusive min scan /// warp_scan_float().scan( /// input, /// in_output, /// ex_output, /// 100.0f, // init /// temp[logical_warp_id], /// rocprim::minimum() /// ); /// ... /// } /// \endcode /// /// If the initial value is \p 100 and input values across threads in a block/tile are /// {1, -2, 3, -4, ..., 255, -256}, then \p in_output values in the first logical /// warp will be {1, -2, -2, -4, ..., -32}, in the second: /// {33, -34, -34, -36, ..., -64} and so forth, \p ex_output values in the first /// logical warp will be {100, 1, -2, -2, -4, ..., -30}, in the second: /// {100, 33, -34, -34, -36, ..., -62} etc. /// \endparblock template, unsigned int FunctionWarpSize = WarpSize> ROCPRIM_DEVICE ROCPRIM_INLINE auto scan(T input, T& inclusive_output, T& exclusive_output, T init, storage_type& storage, BinaryFunction scan_op = BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void>::type { base_type::scan(input, inclusive_output, exclusive_output, init, storage, scan_op); } /// \brief Performs inclusive and exclusive scan operations across threads /// Invalid Warp Size template, unsigned int FunctionWarpSize = WarpSize> ROCPRIM_DEVICE ROCPRIM_INLINE auto scan(T , T& , T& , T , storage_type& , BinaryFunction scan_op = BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void>::type { (void) scan_op; ROCPRIM_PRINT_ERROR_ONCE("Specified warp size exceeds current hardware supported warp size . Aborting warp sort."); return; } /// \brief Performs inclusive and exclusive scan operations, and reduction across /// threads in a logical warp. /// /// \tparam BinaryFunction - type of binary function used for scan. Default type /// is rocprim::plus. /// /// \param [in] input - thread input value. /// \param [out] inclusive_output - reference to a thread inclusive-scan output value. /// \param [out] exclusive_output - reference to a thread exclusive-scan output value. /// \param [in] init - initial value used to start the exclusive scan. Should be the same /// for all threads in a logical warp. /// \param [out] reduction - result of reducing of all \p input values in logical warp. /// \p init value is not included in the reduction. /// \param [in] storage - reference to a temporary storage object of type storage_type. /// \param [in] scan_op - binary operation function object that will be used for scan. /// The signature of the function should be equivalent to the following: /// T f(const T &a, const T &b);. The signature does not need to have /// const &, but function object must not modify the objects passed to it. /// /// \par Storage reusage /// Synchronization barrier should be placed before \p storage is reused /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads(). /// /// \par Examples /// \parblock /// The examples present inclusive and exclusive prefix sum operations performed on groups /// of 64 threads, each thread provides one \p int value. Hardware warp size is 64. /// Block (tile) size is 256. /// /// \code{.cpp} /// __global__ void example_kernel(...) // blockDim.x = 256 /// { /// // specialize warp_scan for int and logical warp of 64 threads /// using warp_scan_int = rocprim::warp_scan; /// // allocate storage in shared memory /// __shared__ warp_scan_int::storage_type temp[4]; // 256/64 = 4 /// /// int logical_warp_id = threadIdx.x/64; /// int input = ...; /// int in_output, ex_output, reduction; /// // inclusive and exclusive prefix sum /// warp_scan_int().scan( /// input, /// in_output, /// ex_output, /// init, /// reduction, /// temp[logical_warp_id] /// ); /// ... /// } /// \endcode /// /// If the initial value is \p 10 and \p input values across threads in a block/tile are /// {1, 1, ..., 1, 1}, then \p in_output values in every logical warp will be /// {1, 2, 3, 4, ..., 63, 64}, and \p ex_output values in every logical warp will /// be {10, 11, 12, 13, ..., 73}. The \p reduction will be 64. /// \endparblock template, unsigned int FunctionWarpSize = WarpSize> ROCPRIM_DEVICE ROCPRIM_INLINE auto scan(T input, T& inclusive_output, T& exclusive_output, T init, T& reduction, storage_type& storage, BinaryFunction scan_op = BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void>::type { base_type::scan( input, inclusive_output, exclusive_output, init, reduction, storage, scan_op ); } /// \brief Performs inclusive and exclusive scan operations across threads /// Invalid Warp Size template, unsigned int FunctionWarpSize = WarpSize> ROCPRIM_DEVICE ROCPRIM_INLINE auto scan(T , T& , T& , T , T& , storage_type& , BinaryFunction scan_op = BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void>::type { (void) scan_op; ROCPRIM_PRINT_ERROR_ONCE("Specified warp size exceeds current hardware supported warp size . Aborting warp sort."); return; } /// \brief Broadcasts value from one thread to all threads in logical warp. /// /// \param [in] input - value to broadcast. /// \param [in] src_lane - id of the thread whose value should be broadcasted /// \param [in] storage - reference to a temporary storage object of type storage_type. /// /// \par Storage reusage /// Synchronization barrier should be placed before \p storage is reused /// or repurposed: \p __syncthreads() or \p rocprim::syncthreads(). template ROCPRIM_DEVICE ROCPRIM_INLINE auto broadcast(T input, const unsigned int src_lane, storage_type& storage) -> typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), T>::type { return base_type::broadcast(input, src_lane, storage); } /// \brief Broadcasts value from one thread to all threads in logical warp. /// Invalid Warp Size template ROCPRIM_DEVICE ROCPRIM_INLINE auto broadcast(T , const unsigned int , storage_type& ) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), T>::type { ROCPRIM_PRINT_ERROR_ONCE("Specified warp size exceeds current hardware supported warp size. Aborting warp sort."); return T(); } #ifndef DOXYGEN_SHOULD_SKIP_THIS protected: template ROCPRIM_DEVICE ROCPRIM_INLINE auto to_exclusive(T inclusive_input, T& exclusive_output, storage_type& storage) -> typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void>::type { return base_type::to_exclusive(inclusive_input, exclusive_output, storage); } template ROCPRIM_DEVICE ROCPRIM_INLINE auto to_exclusive(T , T& , storage_type&) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void>::type { ROCPRIM_PRINT_ERROR_ONCE("Specified warp size exceeds current hardware supported warp size. Aborting warp sort."); return; } #endif }; END_ROCPRIM_NAMESPACE /// @} // end of group warpmodule #endif // ROCPRIM_WARP_WARP_SCAN_HPP_